In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Phrases

import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=16)

# import warnings
# warnings.filterwarnings(action='once')

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


#### Стоит удалить слова-цифры, которые я добавлял вместо обычных numeric токенов. Для этого скопирую список всех слов, которые были вставлены в датасет вместо цифр

In [2]:
ONES_FEMININE = {
    1: ('одна',),2: ('две',),3: ('три',),
    4: ('четыре',),5: ('пять',),6: ('шесть',),
    7: ('семь',),8: ('восемь',),9: ('девять',),
}

ONES = {0: ('ноль',),
    1: ('один',),2: ('два',),3: ('три',),
    4: ('четыре',),5: ('пять',),6: ('шесть',),
    7: ('семь',),8: ('восемь',),9: ('девять',),
}

TENS = {
    0: ('десять',),1: ('одиннадцать',),2: ('двенадцать',),
    3: ('тринадцать',),4: ('четырнадцать',),5: ('пятнадцать',),
    6: ('шестнадцать',),7: ('семнадцать',),8: ('восемнадцать',),
    9: ('девятнадцать',),
}

TWENTIES = {
    2: ('двадцать',),3: ('тридцать',),4: ('сорок',),
    5: ('пятьдесят',),6: ('шестьдесят',),7: ('семьдесят',),
    8: ('восемьдесят',),9: ('девяносто',),
}

HUNDREDS = {
    1: ('сто',),2: ('двести',), 3: ('триста',),
    4: ('четыреста',),5: ('пятьсот',),6: ('шестьсот',),
    7: ('семьсот',),8: ('восемьсот',),9: ('девятьсот',),
}

THOUSANDS = {
    1: ('тысяча', 'тысячи', 'тысяч'),  # 10^3
    2: ('миллион', 'миллиона', 'миллионов'),  # 10^6
    3: ('миллиард', 'миллиарда', 'миллиардов'),  # 10^9
    4: ('триллион', 'триллиона', 'триллионов'),  # 10^12
    5: ('квадриллион', 'квадриллиона', 'квадриллионов'),  # 10^15
    6: ('квинтиллион', 'квинтиллиона', 'квинтиллионов'),  # 10^18
    7: ('секстиллион', 'секстиллиона', 'секстиллионов'),  # 10^21
    8: ('септиллион', 'септиллиона', 'септиллионов'),  # 10^24
    9: ('октиллион', 'октиллиона', 'октиллионов'),  # 10^27
    10: ('нониллион', 'нониллиона', 'нониллионов'),  # 10^30
}

Несколько базовых функция для процессинга текста:

In [23]:
# удаляем числовые токены (иначе слова "тысяча", "девятсот" и подобные будут занимать весь топ самых важных слов)

combined = [ONES, ONES_FEMININE, TENS, TWENTIES, HUNDREDS, THOUSANDS]

numeric = []
for dic in combined:
    for tup in dic.values():
        for string in tup:
            numeric.append(string)
            
def remove_numeric(text):
    return [token for token in text if token not in numeric]

def gen_words(text):
    text = gensim.utils.simple_preprocess(text, deacc=True)
    return remove_numeric(text)

In [None]:
# Удаляем стоп слова, и мусор короче 2 букв, который в стоп слова не попал

stopwords = stopwords.words('russian')
stopwords.append('это')
stopwords.append('год')

def clear_stopwords(text):
    return [word for word in text if word not in stopwords and len(word) > 2]

#### Baseline model:

In [72]:
df = pd.read_csv('/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv')

In [73]:
df.drop('augmented_tokens', axis=1, inplace=True)
df.drop('augmented_ner_tags', axis=1, inplace=True)

In [6]:
nlp = spacy.load("ru_core_news_lg", disable=['parser', 'ner'])

In [75]:
df.head(3)

Unnamed: 0,sentence
0,последний был разбит при сауле жемайтами и зем...
1,научным руководителем был в и арнольд
2,но как же нынче выдают замуж княгиня ни от ког...


In [76]:
def lemmatization(text):
    doc = nlp(text)
    new_text = []
    for token in doc:
        new_text.append(token.lemma_)
    return " ".join(new_text)
        

In [77]:
df['spacy'] = df['sentence'].parallel_apply(lambda x: lemmatization(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [79]:
df['gensim_words'] = df['spacy'].parallel_apply(lambda x: gen_words(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [80]:
df.gensim_words = df.gensim_words.parallel_apply(lambda x: remove_numeric(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [81]:
dictionary = corpora.Dictionary(df['gensim_words'])
corpus = [dictionary.doc2bow(doc) for doc in df['gensim_words']]

In [82]:
lda_model_baseline = gensim.models.ldamulticore.LdaMulticore(
    workers=16,
    corpus = corpus,
    id2word = dictionary,
    num_topics = 15,
    random_state = 13,
    eval_every = 2,
    chunksize = 2000,
    passes = 10,
)

In [83]:
def compute_coherence(model, texts, id2word, coherence='c_v'):
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=texts,
                                         dictionary=id2word,
                                         coherence=coherence)
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)

In [84]:
compute_coherence(lda_model_baseline, df['gensim_words'], dictionary)

Coherence Score:  0.3779725303117619


In [85]:
vis=pyLDAvis.gensim.prepare(lda_model_baseline, corpus, dictionary, mds='mmds', R=15)

In [86]:
pyLDAvis.save_html(vis, 'LDAvis/baseline.html')

In [87]:
lda_model_baseline.print_topics()

[(0,
  '0.015*"он" + 0.013*"на" + 0.010*"год" + 0.008*"для" + 0.006*"работать" + 0.005*"что" + 0.005*"время" + 0.005*"быть" + 0.005*"министр" + 0.005*"союз"'),
 (1,
  '0.023*"год" + 0.020*"город" + 0.016*"по" + 0.013*"на" + 0.008*"центр" + 0.007*"население" + 0.007*"входить" + 0.006*"он" + 0.006*"сша" + 0.006*"после"'),
 (2,
  '0.016*"на" + 0.013*"год" + 0.010*"для" + 0.008*"были" + 0.007*"он" + 0.006*"быть" + 0.006*"конец" + 0.006*"век" + 0.006*"индия" + 0.006*"из"'),
 (3,
  '0.037*"не" + 0.026*"что" + 0.016*"это" + 0.013*"этот" + 0.011*"мы" + 0.010*"на" + 0.010*"он" + 0.010*"быть" + 0.009*"но" + 0.008*"весь"'),
 (4,
  '0.018*"на" + 0.018*"вид" + 0.015*"из" + 0.013*"род" + 0.011*"как" + 0.010*"семеиство" + 0.010*"или" + 0.007*"растение" + 0.007*"также" + 0.006*"что"'),
 (5,
  '0.025*"город" + 0.024*"остров" + 0.018*"на" + 0.014*"штат" + 0.013*"раион" + 0.012*"река" + 0.012*"километр" + 0.009*"область" + 0.009*"из" + 0.009*"центр"'),
 (6,
  '0.043*"год" + 0.016*"быть" + 0.014*"на" + 0.

Функция для экспериментов с параметрами:

In [58]:
def train_compute_coherence_save_chart(name, corpus, dictionary, k, a, b):
    """
    1) Train Model
    2) Compute Coherence score
    3) Create and save visualization
    """
    
    lda_model = gensim.models.ldamulticore.LdaMulticore(workers=16,
                                                        corpus = corpus,
                                                        id2word = dictionary,
                                                        num_topics = k,
                                                        random_state = 13,
                                                        eval_every = 2,
                                                        chunksize = 2000,
                                                        alpha=a,
                                                        eta=b,
                                                        passes = 20)
    
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df['gensim_words'],
                                         dictionary=dictionary,
                                         coherence='c_v')
    
    vis=pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds', R=k)
    pyLDAvis.save_html(vis, f'LDAvis/{name}.html')
    print(coherence_model_lda.get_coherence())
    return lda_model, coherence_model_lda.get_coherence()

___
#### Adjusting lemmatization

In [90]:
df = pd.read_csv('/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv')

In [91]:
df.drop('augmented_tokens', axis=1, inplace=True)
df.drop('augmented_ner_tags', axis=1, inplace=True)

In [24]:
# Попробуем оставить только определенные части речи

def lemmatization(text, allowed_postages=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postages:
            new_text.append(token.lemma_)
    return " ".join(new_text)

In [93]:
df['spacy'] = df['sentence'].parallel_apply(lambda x: lemmatization(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [94]:
df['gensim_words'] = df['spacy'].parallel_apply(lambda x: gen_words(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [95]:
# df.to_csv('lemmatized_for_topics.csv', index=False)

In [37]:
# df['gensim_words'] = df['gensim_words'].parallel_apply(literal_eval)

KeyError: 'gensim_words'

In [96]:
dictionary = corpora.Dictionary(df['gensim_words'])
corpus = [dictionary.doc2bow(doc) for doc in df['gensim_words']]

In [97]:
adjusted_lemmatization_coherence = train_compute_coherence_save_chart('adjusted_lemm',
                                                                      corpus, dictionary, 15, 1, 1)

0.35893649478202744


In [98]:
adjusted_lemmatization_coherence 

0.35893649478202744

___
#### Bigrams + Trigrams and stopwords removal


In [8]:
df = pd.read_csv('/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv')

In [9]:
df.drop('augmented_tokens', axis=1, inplace=True)
df.drop('augmented_ner_tags', axis=1, inplace=True)

In [10]:
def lemmatization(text, allowed_postages=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postages:
            new_text.append(token.lemma_)
    return " ".join(new_text)

In [11]:
#  Пробовал использовать стандартную лемматизцию, результат был хуже
#def lemmatization(text):
#     doc = nlp(text)
#     new_text = []
#     for token in doc:
#         new_text.append(token.lemma_)
#     return " ".join(new_text)

In [12]:
df['spacy'] = df['sentence'].parallel_apply(lambda x: lemmatization(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [17]:
df['gensim_words'] = df['spacy'].parallel_apply(lambda x: gen_words(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [25]:
bigram = gensim.models.Phrases(df['gensim_words'], min_count = 5, threshold=50)
trigram = gensim.models.Phrases(bigram[df['gensim_words']], threshold=50)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [26]:
df['gensim_words'] = df['gensim_words'].parallel_apply(lambda x: clear_stopwords(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [27]:
def make_bi_trigrams(text):
    return trigram_mod[bigram_mod[text]]

In [28]:
df['gensim_words'] = df['gensim_words'].parallel_apply(lambda x: make_bi_trigrams(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [113]:
# df.to_csv('ngrams.csv', index=False)

In [4]:
df = pd.read_csv('ngrams.csv')

In [6]:
df['gensim_words'] = df['gensim_words'].parallel_apply(literal_eval)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12857), Label(value='0 / 12857')))…

In [29]:
dictionary = corpora.Dictionary(df['gensim_words'])
corpus = [dictionary.doc2bow(doc) for doc in df['gensim_words']]

In [40]:
bi_and_trigras_coherence = train_compute_coherence_save_chart('ngrams_lemm',
                                                                      corpus, dictionary, 15, 0.01, 0.01)

0.34917137976291585


In [44]:
#with filter:
dictionary = corpora.Dictionary(df['gensim_words'])
dictionary.filter_extremes(no_below=30, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in df['gensim_words']]

In [42]:
len(dictionary)

4018

In [43]:
filter_extremes_coherence = train_compute_coherence_save_chart('filter_extremes_lemm',
                                                                      corpus, dictionary, 15, 0.01, 0.01)

0.3693857745486614


In [30]:
import tqdm

In [35]:
min_topics = 5
max_topics = 30
step_size = 5
topics_range = range(min_topics, max_topics, step_size)

In [36]:
alpha = list(np.arange(0.01, 1, 0.25))
alpha.append('symmetric')
alpha.append('asymmetric')

In [37]:
beta = list(np.arange(0.01, 1, 0.25))
beta.append('symmetric')

In [95]:
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [103]:
def train_compute_coherence_save_chart(corpus, dictionary, k, a, b):
    """
    1) Train Model
    2) Compute Coherence score
    """
    
    lda_model = gensim.models.ldamulticore.LdaMulticore(workers=16,
                                                        corpus = corpus,
                                                        id2word = dictionary,
                                                        num_topics = k,
                                                        random_state = 13,
                                                        eval_every = 2,
                                                        chunksize = 2000,
                                                        alpha=a,
                                                        eta=b,
                                                        passes = 10)
    
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df['gensim_words'],
                                         dictionary=dictionary,
                                         coherence='c_v')
    
#     vis=pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds', R=k)
#     pyLDAvis.save_html(vis, f'LDAvis/{name}.html')
    print(coherence_model_lda.get_coherence())
    return coherence_model_lda.get_coherence()

In [104]:
pbar.close()
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)))
    
    for k in topics_range:
        for a in alpha:
            for b in beta:
                cv = train_compute_coherence_save_chart(corpus=corpus, dictionary=dictionary, 
                                              k=k, a=a, b=b)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

                pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()


  1%|▎                                        | 1/150 [01:09<2:51:23, 69.02s/it]
  1%|▎                                        | 1/150 [00:44<1:49:23, 44.05s/it]

0.3284127530364974


  1%|▌                                        | 2/150 [01:27<1:47:44, 43.68s/it]

0.3311698588279435


  2%|▊                                        | 3/150 [02:10<1:46:50, 43.61s/it]

0.3145429875603252


  3%|█                                        | 4/150 [02:55<1:46:38, 43.82s/it]

0.31191386140537236


  3%|█▎                                       | 5/150 [03:38<1:45:23, 43.61s/it]

0.33133845576180165


  4%|█▋                                       | 6/150 [04:32<1:53:24, 47.25s/it]

0.3025927373680425


  5%|█▉                                       | 7/150 [05:27<1:58:28, 49.71s/it]

0.30919215535108047


  5%|██▏                                      | 8/150 [06:22<2:01:20, 51.27s/it]

0.3061585903440559


  6%|██▍                                      | 9/150 [07:16<2:02:59, 52.34s/it]

0.3196603144077302


  7%|██▋                                     | 10/150 [08:11<2:03:42, 53.01s/it]

0.31893962234290957


  7%|██▉                                     | 11/150 [09:01<2:00:42, 52.10s/it]

0.24864859916929802


  8%|███▏                                    | 12/150 [09:52<1:59:26, 51.93s/it]

0.26026154352363945


  9%|███▍                                    | 13/150 [10:44<1:58:37, 51.95s/it]

0.2644441525933293


  9%|███▋                                    | 14/150 [11:37<1:58:18, 52.20s/it]

0.2729956388190304


 10%|████                                    | 15/150 [12:28<1:56:47, 51.90s/it]

0.27011568546546394


 11%|████▎                                   | 16/150 [13:13<1:50:51, 49.64s/it]

0.250682438239583


 11%|████▌                                   | 17/150 [13:58<1:47:10, 48.35s/it]

0.275931978049493


 12%|████▊                                   | 18/150 [14:44<1:45:03, 47.75s/it]

0.27904456382664444


 13%|█████                                   | 19/150 [15:31<1:43:44, 47.51s/it]

0.2879183892380706


 13%|█████▎                                  | 20/150 [16:17<1:41:43, 46.95s/it]

0.2695390458757745


 14%|█████▌                                  | 21/150 [17:07<1:43:10, 47.99s/it]

0.3217892450911809


 15%|█████▊                                  | 22/150 [17:57<1:43:26, 48.49s/it]

0.31352058444101816


 15%|██████▏                                 | 23/150 [18:48<1:44:15, 49.26s/it]

0.34229105302559315


 16%|██████▍                                 | 24/150 [19:39<1:44:20, 49.69s/it]

0.32942499114325513


 17%|██████▋                                 | 25/150 [20:29<1:43:53, 49.87s/it]

0.31215552999824403


 17%|██████▉                                 | 26/150 [21:20<1:43:54, 50.28s/it]

0.2602407544368841


 18%|███████▏                                | 27/150 [22:12<1:43:36, 50.54s/it]

0.2565268203061472


 19%|███████▍                                | 28/150 [23:02<1:42:30, 50.41s/it]

0.2602647486671445


 19%|███████▋                                | 29/150 [23:51<1:40:50, 50.00s/it]

0.26737017381012784


 20%|████████                                | 30/150 [24:40<1:39:36, 49.81s/it]

0.2719778785841248
0.3309748698053732


 21%|████████▎                               | 31/150 [25:24<1:35:19, 48.07s/it]

0.3885212208160236


 21%|████████▌                               | 32/150 [26:09<1:32:39, 47.12s/it]

0.40810309537723716


 22%|████████▊                               | 33/150 [26:53<1:30:09, 46.23s/it]

0.3948324479637359


 23%|█████████▎                              | 35/150 [28:26<1:28:30, 46.18s/it]

0.36627058870659823


 24%|█████████▌                              | 36/150 [29:19<1:31:47, 48.31s/it]

0.3026671233628974


 25%|█████████▊                              | 37/150 [30:13<1:34:04, 49.95s/it]

0.30266400661822807
0.3109156884551706


 26%|██████████▍                             | 39/150 [32:02<1:36:55, 52.39s/it]

0.2984952654143791
0.30611093743350337


 27%|██████████▋                             | 40/150 [32:55<1:36:24, 52.58s/it]

0.2494165047154718


 27%|██████████▉                             | 41/150 [33:43<1:32:48, 51.09s/it]

0.27435871226962927


 28%|███████████▏                            | 42/150 [34:29<1:29:23, 49.67s/it]

0.28375080668193986


 29%|███████████▍                            | 43/150 [35:17<1:27:23, 49.01s/it]

0.2942604761413334


 30%|████████████                            | 45/150 [36:51<1:23:45, 47.87s/it]

0.2618947289714389
0.2507732490861863


 31%|████████████▌                           | 47/150 [38:10<1:15:02, 43.72s/it]

0.2559512337725139
0.2649031611027256


 33%|█████████████                           | 49/150 [39:30<1:10:30, 41.89s/it]

0.2721218531280888
0.2563121730793202


 33%|█████████████▎                          | 50/150 [40:10<1:08:35, 41.16s/it]

0.3229903747878703


 35%|█████████████▊                          | 52/150 [41:41<1:10:41, 43.28s/it]

0.3147891925281264
0.3701266690997063


 35%|██████████████▏                         | 53/150 [42:26<1:11:05, 43.98s/it]

0.3944341798951175


 36%|██████████████▍                         | 54/150 [43:13<1:11:39, 44.79s/it]

0.3097304135147213


 37%|██████████████▋                         | 55/150 [43:58<1:11:14, 45.00s/it]

0.33054772443817393


 37%|██████████████▉                         | 56/150 [44:44<1:10:36, 45.07s/it]

0.3339577729055823


 39%|███████████████▍                        | 58/150 [46:13<1:08:57, 44.97s/it]

0.3140175632297051
0.2937548260792463


 39%|███████████████▋                        | 59/150 [46:59<1:08:43, 45.31s/it]

0.3187126179838337


 40%|████████████████                        | 60/150 [47:45<1:08:14, 45.49s/it]

0.3735489693357494


 41%|████████████████▎                       | 61/150 [48:30<1:07:15, 45.34s/it]

0.38046030522502833


 41%|████████████████▌                       | 62/150 [49:15<1:06:19, 45.23s/it]

0.3736904063731046


 42%|████████████████▊                       | 63/150 [50:00<1:05:16, 45.02s/it]

0.37496323626632927


 43%|█████████████████                       | 64/150 [50:46<1:04:55, 45.29s/it]

0.376636901149931


 43%|█████████████████▎                      | 65/150 [51:29<1:03:19, 44.70s/it]

0.29908674299761506


 44%|█████████████████▌                      | 66/150 [52:18<1:04:30, 46.07s/it]

0.31675408309299785


 45%|█████████████████▊                      | 67/150 [53:08<1:05:07, 47.08s/it]

0.3345933717874531


 45%|██████████████████▏                     | 68/150 [53:58<1:05:42, 48.08s/it]

0.3357385570828218


 46%|██████████████████▍                     | 69/150 [54:49<1:06:06, 48.97s/it]

0.308122784938407


 47%|██████████████████▋                     | 70/150 [55:38<1:05:15, 48.95s/it]

0.3187778097666511


 47%|██████████████████▉                     | 71/150 [56:20<1:01:39, 46.83s/it]

0.3525057665396249


 48%|████████████████████▏                     | 72/150 [57:03<59:24, 45.70s/it]

0.34698386874334736


 49%|████████████████████▍                     | 73/150 [57:47<58:03, 45.24s/it]

0.36436151719254856


 49%|████████████████████▋                     | 74/150 [58:32<56:56, 44.96s/it]

0.33429619819021406


 50%|█████████████████████                     | 75/150 [59:15<55:44, 44.60s/it]

0.3010207951096712


 51%|█████████████████████▎                    | 76/150 [59:52<52:04, 42.22s/it]

0.33420948145502993


 51%|████████████████████▌                   | 77/150 [1:00:27<48:46, 40.10s/it]

0.3684183102996299


 53%|█████████████████████                   | 79/150 [1:01:34<43:33, 36.80s/it]

0.3847891677771014
0.314988924449556


 53%|█████████████████████▎                  | 80/150 [1:02:11<42:59, 36.85s/it]

0.3702470260368452


 54%|█████████████████████▌                  | 81/150 [1:02:56<45:03, 39.18s/it]

0.3555741485666697


 55%|█████████████████████▊                  | 82/150 [1:03:40<45:57, 40.55s/it]

0.3879509338127896


 55%|██████████████████████▏                 | 83/150 [1:04:24<46:33, 41.69s/it]

0.37583662862080774


 56%|██████████████████████▍                 | 84/150 [1:05:08<46:44, 42.49s/it]

0.3622892145294803


 57%|██████████████████████▋                 | 85/150 [1:05:52<46:20, 42.78s/it]

0.35706820492738767


 57%|██████████████████████▉                 | 86/150 [1:06:35<45:36, 42.75s/it]

0.34255845670283463


 58%|███████████████████████▏                | 87/150 [1:07:18<45:02, 42.89s/it]

0.3552712234343636


 59%|███████████████████████▍                | 88/150 [1:08:01<44:30, 43.08s/it]

0.36099814341466907


 59%|███████████████████████▋                | 89/150 [1:08:45<44:06, 43.39s/it]

0.3503141161160022


 60%|████████████████████████                | 90/150 [1:09:28<43:10, 43.18s/it]

0.37610882098806064


 61%|████████████████████████▎               | 91/150 [1:10:11<42:18, 43.02s/it]

0.37531067042756905


 61%|████████████████████████▌               | 92/150 [1:10:55<41:48, 43.24s/it]

0.37945491283958704


 62%|████████████████████████▊               | 93/150 [1:11:39<41:22, 43.54s/it]

0.3695303515002359


 63%|█████████████████████████               | 94/150 [1:12:24<41:14, 44.19s/it]

0.3856382965117543


 63%|█████████████████████████▎              | 95/150 [1:13:09<40:39, 44.35s/it]

0.3224221050476124


 64%|█████████████████████████▌              | 96/150 [1:13:55<40:20, 44.82s/it]

0.3211777325764106


 65%|█████████████████████████▊              | 97/150 [1:14:42<40:15, 45.58s/it]

0.3155271460168415


 65%|██████████████████████████▏             | 98/150 [1:15:30<40:05, 46.26s/it]

0.3322988189671138


 66%|██████████████████████████▍             | 99/150 [1:16:19<40:00, 47.07s/it]

0.3179950730725163


 67%|██████████████████████████             | 100/150 [1:17:06<39:13, 47.07s/it]

0.33097111475545415


 67%|██████████████████████████▎            | 101/150 [1:17:48<37:09, 45.49s/it]

0.3504479453301176


 68%|██████████████████████████▌            | 102/150 [1:18:29<35:19, 44.15s/it]

0.3846121715582523


 69%|██████████████████████████▊            | 103/150 [1:19:10<33:54, 43.28s/it]

0.376831906263577


 69%|███████████████████████████            | 104/150 [1:19:52<32:41, 42.63s/it]

0.3359166240386278


 70%|███████████████████████████▎           | 105/150 [1:20:32<31:33, 42.07s/it]

0.31992664557842976


 71%|███████████████████████████▌           | 106/150 [1:21:04<28:38, 39.05s/it]

0.3766606666500683


 72%|████████████████████████████           | 108/150 [1:22:04<24:02, 34.34s/it]

0.3987171095341734


 73%|████████████████████████████▎          | 109/150 [1:22:33<22:23, 32.77s/it]

0.41321581804560975


 73%|████████████████████████████▌          | 110/150 [1:23:05<21:36, 32.41s/it]

0.3303393357449792
0.35342611558066905


 74%|████████████████████████████▊          | 111/150 [1:23:47<22:57, 35.33s/it]

0.3803760349204544


 75%|█████████████████████████████          | 112/150 [1:24:31<23:56, 37.81s/it]

0.3711541602466717


 75%|█████████████████████████████▍         | 113/150 [1:25:14<24:25, 39.61s/it]

0.37352977176969426


 76%|█████████████████████████████▋         | 114/150 [1:25:59<24:41, 41.17s/it]

0.36948973520830436


 77%|█████████████████████████████▉         | 115/150 [1:26:41<24:08, 41.38s/it]

0.340489488495356


 77%|██████████████████████████████▏        | 116/150 [1:27:22<23:26, 41.35s/it]

0.33593204265304083


 78%|██████████████████████████████▍        | 117/150 [1:28:04<22:46, 41.42s/it]

0.36630254558620645


 79%|██████████████████████████████▋        | 118/150 [1:28:46<22:09, 41.53s/it]

0.36244585729221024


 79%|██████████████████████████████▉        | 119/150 [1:29:29<21:40, 41.95s/it]

0.3347436960205046


 80%|███████████████████████████████▏       | 120/150 [1:30:10<20:57, 41.91s/it]

0.36538070769442377


 81%|███████████████████████████████▍       | 121/150 [1:30:53<20:21, 42.12s/it]

0.3823964624000667


 81%|███████████████████████████████▋       | 122/150 [1:31:36<19:46, 42.39s/it]

0.37242314005239785


 82%|███████████████████████████████▉       | 123/150 [1:32:20<19:14, 42.77s/it]

0.37862006082409766


 83%|████████████████████████████████▏      | 124/150 [1:33:06<18:57, 43.75s/it]

0.36898642867069276


 83%|████████████████████████████████▌      | 125/150 [1:33:48<18:05, 43.41s/it]

0.3162551686111023


 84%|████████████████████████████████▊      | 126/150 [1:34:33<17:33, 43.88s/it]

0.3162668096398033


 85%|█████████████████████████████████      | 127/150 [1:35:20<17:07, 44.68s/it]

0.30999393819175297


 85%|█████████████████████████████████▎     | 128/150 [1:36:06<16:33, 45.18s/it]

0.3073686829876191


 86%|█████████████████████████████████▌     | 129/150 [1:36:53<15:59, 45.69s/it]

0.3228541751750895


 87%|█████████████████████████████████▊     | 130/150 [1:37:41<15:27, 46.38s/it]

0.33286666842224705


 87%|██████████████████████████████████     | 131/150 [1:38:20<13:57, 44.09s/it]

0.3547688018785328


 88%|██████████████████████████████████▎    | 132/150 [1:39:00<12:54, 43.05s/it]

0.35950439398220996


 89%|██████████████████████████████████▌    | 133/150 [1:39:40<11:52, 41.93s/it]

0.3972720938918751


 89%|██████████████████████████████████▊    | 134/150 [1:40:19<10:56, 41.06s/it]

0.3421592055691467


 90%|███████████████████████████████████    | 135/150 [1:40:58<10:06, 40.44s/it]

0.3275921446704818


 91%|███████████████████████████████████▎   | 136/150 [1:41:29<08:46, 37.58s/it]

0.3795458928202338


 92%|███████████████████████████████████▉   | 138/150 [1:42:29<06:43, 33.66s/it]

0.42436382451104815
0.43536806107077375


 93%|████████████████████████████████████▍  | 140/150 [1:43:30<05:22, 32.26s/it]

0.32630072528156295
0.34809482658907503


 94%|████████████████████████████████████▋  | 141/150 [1:44:12<05:16, 35.22s/it]

0.3752849157586051


 95%|████████████████████████████████████▉  | 142/150 [1:44:55<04:58, 37.37s/it]

0.38232314327619216


 95%|█████████████████████████████████████▏ | 143/150 [1:45:39<04:35, 39.33s/it]

0.38106546068567354


 96%|█████████████████████████████████████▍ | 144/150 [1:46:22<04:02, 40.44s/it]

0.3562788350440077


 97%|█████████████████████████████████████▋ | 145/150 [1:47:03<03:23, 40.73s/it]

0.3511315162190756


 97%|█████████████████████████████████████▉ | 146/150 [1:47:45<02:44, 41.03s/it]

0.36104508187989615


 98%|██████████████████████████████████████▏| 147/150 [1:48:25<02:02, 40.78s/it]

0.3615086747250917


 99%|██████████████████████████████████████▍| 148/150 [1:49:06<01:21, 40.95s/it]

0.3693399780324968


 99%|██████████████████████████████████████▋| 149/150 [1:49:50<00:41, 41.65s/it]

0.34528868158314224


100%|███████████████████████████████████████| 150/150 [1:50:30<00:00, 41.20s/it]

OSError: Cannot save file into a non-existent directory: 'results'

In [106]:
df_res = pd.DataFrame(model_results)

In [33]:
df_res.to_csv('lda_tuning_results.csv', index=False)

In [31]:
df_res = pd.read_csv('lda_tuning_results.csv')

In [34]:
df_res.sort_values(by='Coherence')

Unnamed: 0,Topics,Alpha,Beta,Coherence
14,5,0.51,0.01,0.248649
44,10,0.51,0.01,0.249417
19,5,0.76,0.01,0.250682
49,10,0.76,0.01,0.250773
3,5,0.01,0.01,0.253108
...,...,...,...,...
111,20,0.76,0.51,0.398717
36,10,0.01,0.51,0.408103
112,20,0.76,0.76,0.413216
141,25,0.76,0.51,0.424364


Я просмотрел несколько лучших вариантов по параметрам, и чсто визуально мне больше вссего понравился такйо вариант:

In [59]:
lda_model, cv = train_compute_coherence_save_chart(name='best_model_vis', corpus=corpus, dictionary=dictionary, 
                                              k=15, a=0.51, b=0.51)

0.3848752897496348


In [70]:
lda_model.save('best_model.model')