# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 5 классификаторами - SGDClassifier, KNeighborsClassifier, MultinomialNB, RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 10 моделей (два разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [100]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
import pyLDAvis.gensim_models
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
morph = MorphAnalyzer()
warnings.filterwarnings("ignore")

In [101]:
data = pd.read_csv('avito_category_classification.csv')

In [102]:
data

Unnamed: 0,category_name,description
0,Автомобили,"отличное состояние,обслужиание в салоне"
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник..."
3,Автомобили,Автомобиль в отличном техническом состоянии. О...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ..."
...,...,...
9893,Товары для детей и игрушки,Чтобы посмотреть весь ассортимент нашего магаз...
9894,Детская одежда и обувь,"Весна,осень.74-80.вопросы можно в вайбер,двухс..."
9895,"Одежда, обувь, аксессуары","Кимоно Green Hill. Состояние отличное, рост ..."
9896,Детская одежда и обувь,Б/у кроссовки на девочку. Носили только в спор...


In [103]:
# добавим лемматизацию
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word \
                                                            in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)

In [104]:
data['description_norm'] = data['description'].apply(normalize)

KeyboardInterrupt: 

In [None]:
vectorizer = CountVectorizer(min_df=5, max_df=0.5)

In [None]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

### SGDClassifier

In [None]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('scaler', StandardScaler()),
    ('clf', SGDClassifier())
])

In [None]:
metrics_nmf_sgd, errors_nmf_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_sgd


Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.79,0.07,0.74,0.07,0.76,0.01
Телефоны,0.73,0.12,0.66,0.07,0.68,0.05
"Одежда, обувь, аксессуары",0.57,0.06,0.64,0.08,0.6,0.02
Предложение услуг,0.61,0.04,0.68,0.09,0.64,0.05
Квартиры,0.95,0.03,0.93,0.03,0.94,0.01
Мебель и интерьер,0.4,0.11,0.2,0.05,0.26,0.07
Бытовая техника,0.24,0.1,0.14,0.08,0.18,0.09
Ремонт и строительство,0.44,0.07,0.36,0.11,0.38,0.08
Детская одежда и обувь,0.53,0.03,0.59,0.07,0.56,0.03
Товары для детей и игрушки,0.55,0.1,0.49,0.06,0.51,0.06


In [None]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('scaler', StandardScaler()),
    ('clf', SGDClassifier())
])

In [None]:
metrics_svd_sgd, errors_svd_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.88,0.04,0.77,0.05,0.82,0.02
Телефоны,0.82,0.02,0.69,0.06,0.75,0.03
"Одежда, обувь, аксессуары",0.65,0.03,0.78,0.03,0.71,0.01
Предложение услуг,0.77,0.05,0.63,0.06,0.69,0.05
Квартиры,0.98,0.01,0.82,0.03,0.89,0.02
Мебель и интерьер,0.67,0.05,0.58,0.05,0.62,0.05
Бытовая техника,0.56,0.09,0.36,0.11,0.43,0.11
Ремонт и строительство,0.53,0.05,0.49,0.05,0.51,0.04
Детская одежда и обувь,0.72,0.02,0.77,0.03,0.74,0.01
Товары для детей и игрушки,0.66,0.03,0.66,0.05,0.66,0.03


### KNeighborsClassifier

In [None]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', KNeighborsClassifier(n_neighbors=3))
])

metrics_nmf_knn, errors_nmf_knn = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.45,0.04,0.67,0.05,0.54,0.04
Телефоны,0.73,0.1,0.34,0.08,0.46,0.09
"Одежда, обувь, аксессуары",0.5,0.03,0.51,0.03,0.51,0.03
Предложение услуг,0.61,0.07,0.51,0.03,0.56,0.04
Квартиры,0.89,0.02,0.83,0.04,0.86,0.03
Мебель и интерьер,0.25,0.07,0.2,0.06,0.22,0.06
Бытовая техника,0.14,0.04,0.24,0.06,0.18,0.04
Ремонт и строительство,0.38,0.09,0.17,0.05,0.23,0.07
Детская одежда и обувь,0.43,0.02,0.57,0.02,0.49,0.02
Товары для детей и игрушки,0.58,0.04,0.27,0.06,0.37,0.06


In [None]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', KNeighborsClassifier(n_neighbors=3))
])

metrics_svd_knn, errors_svd_knn = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.48,0.06,0.68,0.05,0.56,0.05
Телефоны,0.75,0.08,0.36,0.03,0.48,0.04
"Одежда, обувь, аксессуары",0.54,0.02,0.54,0.04,0.54,0.03
Предложение услуг,0.74,0.05,0.47,0.04,0.57,0.03
Квартиры,0.87,0.05,0.77,0.06,0.82,0.04
Мебель и интерьер,0.39,0.07,0.33,0.04,0.36,0.05
Бытовая техника,0.19,0.02,0.35,0.05,0.24,0.03
Ремонт и строительство,0.37,0.06,0.18,0.05,0.24,0.05
Детская одежда и обувь,0.48,0.02,0.67,0.03,0.56,0.01
Товары для детей и игрушки,0.6,0.08,0.29,0.03,0.39,0.05


### RandomForest

In [None]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=1))
])

metrics_nmf_rf, errors_nmf_rf = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.0,0.0,0.0,0.0,0.0,0.0
Телефоны,0.0,0.0,0.0,0.0,0.0,0.0
"Одежда, обувь, аксессуары",0.26,0.03,0.7,0.06,0.38,0.04
Предложение услуг,0.0,0.0,0.0,0.0,0.0,0.0
Квартиры,0.89,0.05,0.65,0.18,0.74,0.12
Мебель и интерьер,0.0,0.0,0.0,0.0,0.0,0.0
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.46,0.05,0.51,0.06,0.48,0.04
Товары для детей и игрушки,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=1))
])

metrics_svd_knn, errors_svd_knn = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.0,0.0,0.0,0.0,0.0,0.0
Телефоны,0.0,0.0,0.0,0.0,0.0,0.0
"Одежда, обувь, аксессуары",0.32,0.02,0.83,0.04,0.46,0.02
Предложение услуг,0.0,0.0,0.0,0.0,0.0,0.0
Квартиры,0.5,0.55,0.05,0.09,0.08,0.14
Мебель и интерьер,0.0,0.0,0.0,0.0,0.0,0.0
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.29,0.03,0.42,0.07,0.34,0.04
Товары для детей и игрушки,0.0,0.0,0.0,0.0,0.0,0.0


### ExtraTreesClassifier

In [None]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

metrics_nmf_extra_trees, errors_nmf_extra_trees = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_extra_trees

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.77,0.02,0.85,0.06,0.81,0.03
Телефоны,0.81,0.05,0.71,0.04,0.76,0.03
"Одежда, обувь, аксессуары",0.63,0.02,0.69,0.02,0.66,0.01
Предложение услуг,0.7,0.04,0.75,0.03,0.72,0.02
Квартиры,0.92,0.03,0.96,0.01,0.94,0.02
Мебель и интерьер,0.59,0.05,0.37,0.04,0.46,0.04
Бытовая техника,0.53,0.09,0.22,0.07,0.3,0.07
Ремонт и строительство,0.56,0.11,0.37,0.05,0.44,0.07
Детская одежда и обувь,0.61,0.01,0.73,0.02,0.66,0.01
Товары для детей и игрушки,0.67,0.03,0.53,0.05,0.59,0.04


In [None]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

metrics_svd_extra_trees, errors_svd_extra_trees = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_extra_trees

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.81,0.06,0.46,0.04,0.58,0.04
Телефоны,0.88,0.06,0.37,0.02,0.52,0.02
"Одежда, обувь, аксессуары",0.48,0.01,0.74,0.02,0.58,0.01
Предложение услуг,0.83,0.06,0.28,0.04,0.42,0.05
Квартиры,0.74,0.03,0.8,0.04,0.77,0.03
Мебель и интерьер,0.83,0.07,0.16,0.04,0.26,0.06
Бытовая техника,0.67,0.12,0.15,0.04,0.24,0.05
Ремонт и строительство,0.59,0.1,0.08,0.01,0.14,0.02
Детская одежда и обувь,0.46,0.01,0.74,0.03,0.57,0.02
Товары для детей и игрушки,0.68,0.06,0.19,0.02,0.3,0.03


**Лучший результат показала комбинация SVD и SGDClassifier: mean f1 score 0.68**

# Задание № 2 (6 баллов)

В Gensim тоже можно добавить нграммы и tfidf. Постройте 1 модель без них (как в семинаре) и еще 3 модели (1 с нграммами, 1 с tfidf и 1 с нграммами и с tfidf). Сранивте качество с помощью метрик (перплексия, когерентность) и на глаз. Определите лучшую модель. Для каждой модели выберите 1 самую красивую на ваш взгляд тему.

Используйте данные википедии из семинара. Можете взять поменьше данных, если все обучается долго.

Важное требование - получившиеся модели не должны быть совсем плохими. Если хороших тем не получается, попробуйте настроить гиперпараметры, отфильтровать словарь по-другому. 

In [None]:
texts = open('wiki_data.txt').read().splitlines()[:1000]
texts = ([normalize(text) for text in texts])

### Model 1 (No TFIDF, no ngramms)

In [None]:
dictionary1 = gensim.corpora.Dictionary((text.split() for text in texts))

dictionary1.filter_extremes(no_above=0.1, no_below=10)
dictionary1.compactify()

In [None]:
corpus = [dictionary1.doc2bow(text.split()) for text in texts]

In [None]:
lda = gensim.models.LdaMulticore(corpus, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary1, 
                                 passes=10) 

In [None]:
lda.print_topics()

[(99,
  '0.068*"улица" + 0.065*"волость" + 0.031*"уезд" + 0.031*"губерния" + 0.019*"брянский" + 0.015*"переулок" + 0.011*"калужский" + 0.008*"сельсовет" + 0.008*"далее" + 0.008*"фрэнсис"'),
 (98,
  '0.087*"уезд" + 0.038*"округ" + 0.032*"городской" + 0.016*"специальный" + 0.015*"образовать" + 0.013*"империя" + 0.013*"переименовать" + 0.012*"власть" + 0.011*"административный" + 0.011*"провинция"'),
 (97,
  '0.022*"музыкальный" + 0.021*"музыка" + 0.017*"’" + 0.015*"оркестр" + 0.015*"опера" + 0.012*"язык" + 0.011*"азербайджанский" + 0.009*"театр" + 0.009*"народный" + 0.008*"композитор"'),
 (96,
  '0.036*"граф" + 0.031*"де" + 0.030*"i" + 0.027*"ii" + 0.026*"сын" + 0.026*"король" + 0.017*"графство" + 0.013*"герцог" + 0.012*"дочь" + 0.012*"смерть"'),
 (95,
  '0.039*"департамент" + 0.028*"атлетика" + 0.020*"лёгкий" + 0.016*"провинция" + 0.014*"олимпиада" + 0.013*"соревнование" + 0.013*"аргентина" + 0.011*"жужевать" + 0.011*"спортсмен" + 0.011*"км²"'),
 (94,
  '0.023*"норвегия" + 0.017*"день" +

### Model 2 (Ngramms)

Нграммы добавляются вот так (перед созданиеv словаря)

In [None]:
texts = [text.split() for text in texts]
ph = gensim.models.Phrases(texts, scoring='npmi', threshold=0.4) # threshold можно подбирать
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts] 



# ! не забудьте, что далее вам нужно будет использовать ngrammed_texts

!! В модели с нграммами вначале посмотрите, что получается после преобразования
Если вы выведите несколько первых текстов в ngrammed_texts, то там должно быть что-то такое:

In [None]:
[text for text in ngrammed_texts[:3]]

[['новостройка',
  'нижегородский_область',
  'новостро́йка',
  '—',
  'сельский',
  'посёлок',
  'в',
  'дивеевский_район',
  'нижегородский_область',
  'входить',
  'в',
  'состав_сатисский',
  'сельсовет',
  'посёлок',
  'расположить',
  'в',
  '12,5',
  'км_к',
  'юг_от',
  'село_дивеево',
  'и',
  '1_км',
  'к_запад',
  'от',
  'город',
  'саров',
  'на',
  'право_берег',
  'река',
  'вичкинза',
  'правый_приток',
  'река',
  'сатис',
  'окружить',
  'смешанный',
  'лес',
  'соединить',
  'асфальтовый',
  'дорога',
  'с',
  'посёлок',
  'цыгановка',
  '1,5_км',
  'и',
  'грунтовый',
  'просёлочный',
  'дорога',
  'с',
  'посёлок_сатис',
  '3,5',
  'км',
  'название',
  'новостройка',
  'являться',
  'сугубо',
  'официальный',
  'местный_население',
  'использовать',
  'исключительно',
  'альтернативный',
  'название',
  '—',
  'хитрый',
  'употребляться_языковой',
  'оборот',
  '«',
  '…',
  'на',
  'хитрый',
  '»',
  'ранее',
  'использовать',
  'название',
  '—',
  'песчаный',
 

Если вы не видите нграммов, то попробуйте изменить параметр threshold

In [105]:
dictionary2 = gensim.corpora.Dictionary((ngrammed_texts))

dictionary2.filter_extremes(no_above=0.1, no_below=10)
dictionary2.compactify()

corpus2 = [dictionary2.doc2bow(text) for text in ngrammed_texts]

In [106]:
lda_ngramms = gensim.models.LdaMulticore(corpus2, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary2, 
                                 passes=10) 

lda_ngramms.print_topics()

[(99,
  '0.020*"матч" + 0.019*"сборная" + 0.015*"команда" + 0.012*"турнир" + 0.011*"клуб" + 0.010*"раз" + 0.010*"ирландия" + 0.010*"пройти" + 0.009*"выиграть" + 0.008*"чемпионат_мир"'),
 (98,
  '0.044*"корабль" + 0.022*"миссия" + 0.021*"двигатель" + 0.020*"станция" + 0.019*"полёт" + 0.015*"использоваться" + 0.014*"система" + 0.013*"около" + 0.013*"земля" + 0.012*"использовать"'),
 (97,
  '0.018*"клуб" + 0.018*"животное" + 0.015*"земля" + 0.012*"древний" + 0.012*"команда" + 0.011*"президент" + 0.010*"тренер" + 0.010*"народ" + 0.010*"сын" + 0.009*"существовать"'),
 (96,
  '0.023*"значение" + 0.019*"проект" + 0.019*"фестиваль" + 0.018*"городской" + 0.015*"пространство" + 0.015*"символ" + 0.015*"выставка" + 0.014*"сельсовет" + 0.014*"мы" + 0.013*"создать"'),
 (95,
  '0.081*"клуб" + 0.028*"матч" + 0.026*"контракт" + 0.024*"сезон" + 0.023*"перейти" + 0.019*"сборная" + 0.019*"г" + 0.019*"кнр" + 0.018*"команда" + 0.018*"провести"'),
 (94,
  '0.029*"система" + 0.027*"рыба" + 0.013*"разработать"

### Model 3 (Ngramms + TFIDF)

In [107]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary2, )
corpus3 = tfidf[corpus2]

In [108]:
lda_ngramms_tfidf = gensim.models.LdaMulticore(corpus3, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary2, 
                                 passes=10) 

lda_ngramms_tfidf.print_topics()

[(99,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (98,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (96,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (97,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (94,
  '0.001*"заместитель_председатель" + 0.001*"народный" + 0.001*"собрание" + 0.001*"присвоить_звание" + 0.001*"1900" + 0.001*"японский" + 0

### Model 4 (TFIDF)

In [110]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary1, )
corpus4 = tfidf[corpus]

In [111]:
lda_tfidf = gensim.models.LdaMulticore(corpus4, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary1, 
                                 passes=10) 

lda_tfidf.print_topics()

[(99,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (98,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (97,
  '0.001*"линия" + 0.000*"сеть" + 0.000*"городской" + 0.000*"маршрут" + 0.000*"система" + 0.000*"участок" + 0.000*"20" + 0.000*"улица" + 0.000*"продлить" + 0.000*"путь"'),
 (96,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (95,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (94,
  '0.000*"конечный" + 0.000*"кру

### Подсчет метрик

In [112]:
# lda perplexity

np.exp2(-lda.log_perplexity(corpus[:1000]))

211.43875715212678

In [117]:
# lda coherence

topics = []
for topic_id, topic in lda.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary1, coherence='c_v')

coherence_model_lda.get_coherence()

0.4878013870543314

In [118]:
# lda_ngramms perplexity

np.exp2(-lda_ngramms.log_perplexity(corpus[:1000]))

882.6540146239026

In [119]:
# lda_ngramms coherence (почему-то возвращается nan)

topics = []
for topic_id, topic in lda_ngramms.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary2, coherence='c_v')

coherence_model_lda.get_coherence()


nan

In [120]:
# lda_ngramms_tfidf perplexity

np.exp2(-lda_ngramms_tfidf.log_perplexity(corpus[:1000]))

64891.295506338705

In [121]:
# lda_ngramms_tfidf coherence (и тут снова nan)

topics = []
for topic_id, topic in lda_ngramms_tfidf.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary2, coherence='c_v')

coherence_model_lda.get_coherence()

nan

In [122]:
# lda_tfidf perplexity

np.exp2(-lda_tfidf.log_perplexity(corpus[:1000]))

529.5589131383166

In [123]:
# lda_tfidf coherence

topics = []
for topic_id, topic in lda_tfidf.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary1, coherence='c_v')

coherence_model_lda.get_coherence()

0.43396947204960873

**Выходит, наименьшая перплексия из всех 4-х моделей у самой первой без TFIDF и без нграммов (но скорее всего, здесь что-то не так). Когерентность (из тех, которе удалось посчитать) лучшая у модели 4 с TFIDF, хотя разница небольшая.**