In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import requests

from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords 

from pymystem3 import Mystem
import pymorphy2

import requests

from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords 

from gensim.models.phrases import Phrases


from gensim.models import LdaMulticore #, LdaModel
from gensim import corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
warnings.filterwarnings("ignore")

# Modules of Functions for preprocessing

### Save data to DataFrame

In [2]:
# Corpus file: tab-separated, resolved relative to the notebook's working directory.
tales_path = './tales.csv'
df = pd.read_csv(tales_path, sep='\t')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './tales.csv'

### Separate tales
Strip the class label, lowercase the text, and split the tales into 3 groups:

0. adult tales
1. children tales
2. all ages tales

In [3]:
def tales_separator(df, category):
    """Select the rows of one category and normalise their text.

    Keeps the rows whose 'Tale,Label' value ends with ``category``, then drops
    the last four characters of each kept value and lowercases the rest.

    Args:
        df: frame with a string column named 'Tale,Label'.
        category: suffix identifying the wanted rows (e.g. '0').

    Returns:
        A new frame (the input is left untouched) with the original index
        labels of the selected rows.
    """
    column = 'Tale,Label'
    has_label = df[column].str.endswith(category)
    selected = df[has_label].copy()
    selected[column] = selected[column].str.slice(stop=-4).str.lower()
    return selected

In [4]:
# Split the corpus by label suffix: '0' -> adult, '1' -> children, '2' -> all ages.
adult_tales, children_tales, all_ages_tales = (
    tales_separator(df, label) for label in ('0', '1', '2')
)

NameError: name 'df' is not defined

### Merge tales with target column

In [5]:
def merge_topics(df_dict, col, target):
    """Stack several frames into one, tagging every row with its dict key.

    Args:
        df_dict: mapping ``key -> DataFrame``; each frame is expected to hold
            the text column ``col``.
        col: name of the text column.
        target: name of the label column to create; each row receives the key
            of the frame it came from.

    Returns:
        A new DataFrame with the rows of all frames in dict order, original
        index labels preserved, and ``col`` and ``target`` as the first two
        columns. An empty mapping yields an empty frame with those two columns.

    The input frames are not modified: each is copied before the label column
    is added. All frames are concatenated in a single call instead of growing
    the result inside the loop.
    """
    tagged_frames = []
    for key, frame in df_dict.items():
        tagged = frame.copy()
        tagged[target] = key
        tagged_frames.append(tagged)

    if not tagged_frames:
        return pd.DataFrame(columns=[col, target])

    merged = pd.concat(tagged_frames)
    leading = [col, target]
    ordered = leading + [name for name in merged.columns if name not in leading]
    # reindex (rather than plain selection) keeps `col` present even if a frame lacked it
    return merged.reindex(columns=ordered)

In [None]:
# Target ids assigned here: 0 = children, 1 = adult, 2 = all ages.
# NOTE(review): the raw label suffixes used for the split are '0' = adult and
# '1' = children, i.e. the first two ids are swapped here — confirm this is intended.
df_dict = dict(enumerate((children_tales, adult_tales, all_ages_tales)))

df_raw = merge_topics(df_dict, 'Tale,Label', 'target')
df_raw.head()

### Clearing lines with Regular Expressions

1. "3-ий1" case - [0-9]{1,}-[0-9,а-я]{1,}
2. numbers - [0-9]
3. extra symbols - [-,.?:;!»«/\—)(#$%^&*№%'"]
4. runs of one or more spaces (collapsed to a single space) - [ ]{1,}
5. start / end spaces

In [None]:
def _clean_text(text):
    """Apply the regex clean-up steps to one string and return the result."""
    # "3-ий1"-style tokens (digits, hyphen, digits/commas/lowercase Cyrillic) are dropped
    text = re.sub(r"[0-9]{1,}-[0-9,а-я]{1,}", "", text)
    # remaining digits, double quotes and punctuation become spaces
    text = re.sub(r"[0-9]", " ", text)
    text = re.sub(r'"', " ", text)
    text = re.sub(r"[-,.?:;!»«/\–)(#$%^&*№%']", " ", text)
    # collapse space runs; at most one space is then left at each end
    text = re.sub(r"[ ]{1,}", " ", text)
    return text.strip(" ")


def regular_cleaning(df, col):
    """Clean every string in ``df[col]`` with the regex rules of ``_clean_text``.

    Args:
        df: frame holding the text column; its column is overwritten in place.
        col: name of the string column to clean.

    Returns:
        The same ``df`` object, with ``col`` replaced by the cleaned strings.

    The column is replaced with one whole-column assignment instead of the
    chained ``df[col][row] = ...`` writes, which pandas does not guarantee to
    reach the frame and which failed when index labels were duplicated.
    """
    df[col] = df[col].map(_clean_text)
    return df

In [None]:
# children_tales = regular_cleaning(children_tales, 'Tale,Label')

### Lemmatize lines (1) with pymystem3

In [None]:
# Notebook-level Mystem instance; the prepoc_2..prepoc_5 pipelines pass it to lemmatize_str.
m = Mystem()

def lemmatize_str(sent, myStemObj):
    """Lemmatize a string and keep only the purely alphabetic lemmas.

    Args:
        sent: text to lemmatize.
        myStemObj: object exposing ``lemmatize(text)`` that returns an iterable
            of strings (the notebook passes a ``Mystem`` instance).

    Returns:
        List of lemmas for which ``str.isalpha()`` is true. If the lemmatizer
        raises ``BrokenPipeError`` the offending text is printed and an empty
        list is returned, so the list-based steps that follow still receive a
        list rather than ``None``.
    """
    try:
        lemmas = myStemObj.lemmatize(sent)
    except BrokenPipeError:
        print(sent)
        return []
    return [lemma for lemma in lemmas if lemma.isalpha()]

In [None]:
# children_tales['Tale,Label'] = children_tales['Tale,Label'].apply(lambda x: lemmatize_str(x, m))

### Lemmatize lines (2) with pymorphy2

In [None]:
# Notebook-level pymorphy2 analyzer; prepoc_1 passes it to lemmat_pymorph.
morph = pymorphy2.MorphAnalyzer()

def lemmat_pymorph(sent, morph):
    """Split ``sent`` on whitespace and return the normal form of each word.

    Args:
        sent: text to process.
        morph: analyzer exposing ``parse(word)``; the ``normal_form`` of the
            first parse result is used for every word.

    Returns:
        List of normal forms, one per whitespace-separated word.
    """
    normal_forms = []
    for token in sent.split():
        first_parse = morph.parse(token)[0]
        normal_forms.append(first_parse.normal_form)
    return normal_forms

In [None]:
# children_tales['Tale,Label'] = children_tales['Tale,Label'].apply(lambda x: lemmat_pymorph(x, morph))

### Remove Stop words
Build a combined stop-word list and define a function that removes stop words.

In [None]:
# download stop words list from github
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being split into "stop words".
r = requests.get('https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt',
                 timeout=30)
r.raise_for_status()
stop_list = r.text.split()

# merge NLTK & github stop words lists
stop_words_dict = stopwords.words('russian')
print("NLTK Stop words dictionary length:", len(stop_words_dict), type(stop_words_dict))
stop_words_dict.extend(stop_list)
stop_words_dict = list(set(stop_words_dict))
# 'не' is kept out of the stop list so HE_merge / HE_remove can handle it later;
# the membership guard avoids a ValueError should neither source list contain it.
if 'не' in stop_words_dict:
    stop_words_dict.remove('не')
stop_words_dict.append('—')

print("Final Stop words dictionary length:", len(stop_words_dict), type(stop_words_dict))

In [None]:
def stop_words_remover(corpus, stopWords, col):
    """Drop stop words from every token list in ``corpus[col]``.

    Args:
        corpus: frame whose column ``col`` holds an iterable of tokens per row;
            the column is overwritten in place.
        stopWords: iterable of hashable tokens to remove.
        col: name of the token-list column.

    Returns:
        The same ``corpus`` object with ``col`` replaced by the filtered lists.

    The stop words are put in a set once, so each membership test is O(1)
    instead of a scan of the whole list. The column is replaced with one
    whole-column assignment rather than the chained ``corpus[col].loc[...] =``
    writes, which pandas does not guarantee to reach the frame and which
    misbehaved on duplicate index labels.
    """
    stop_set = frozenset(stopWords)
    corpus[col] = corpus[col].map(
        lambda tokens: [word for word in tokens if word not in stop_set]
    )
    return corpus

In [None]:
# children_tales = stop_words_remover(children_tales, stop_words_dict, 'Tale,Label')

### Make Bigrams & Trigrams

In [None]:
def n_gram_maker(df, col, min_count = 4, threshold = 10):
    """Detect frequent phrases in the token lists of ``df[col]`` and merge them.

    Args:
        df: frame whose column ``col`` holds one token list per row.
        col: name of the token-list column.
        min_count: frequency setting; ``min_count - 1`` is what is handed to
            ``Phrases`` (presumably so the caller's value acts as an inclusive
            threshold — confirm against gensim's scoring).
        threshold: score threshold forwarded to ``Phrases``.

    Returns:
        List of token lists, one per row, after applying the trained phrase
        model to the same documents.

    The phrase model is applied to the corpus once; the earlier version
    transformed the whole corpus twice and threw the first result away.
    """
    documents = list(df[col])
    ngrams = Phrases(documents, min_count = min_count - 1, threshold = threshold)
    return list(ngrams[documents])

bigrams

In [None]:
# bigrams_children_tales = children_tales.copy()
# bigrams_children_tales['Tale,Label'] = n_gram_maker(children_tales, 'Tale,Label', 8, 7)

### Merge "НЕ" + next_word 

In [None]:
def HE_merge(wd_list):
    """Glue every 'не' to the word that follows it ('не', 'x' -> 'не_x').

    Args:
        wd_list: sequence of tokens.

    Returns:
        A new list in which each 'не' that has a successor is replaced by
        ``'не_' + successor`` and that successor is consumed. A trailing 'не'
        is kept unchanged.

    The merged word is skipped by position, so other occurrences of the same
    word elsewhere in the list are kept (removing it by value dropped all of
    them). The input list is not modified.
    """
    merged = []
    last_index = len(wd_list) - 1
    skip_next = False
    for i, wd in enumerate(wd_list):
        if skip_next:
            # this token was already absorbed into the preceding 'не_...'
            skip_next = False
            continue
        if wd == 'не' and i != last_index:
            merged.append(wd + '_' + wd_list[i + 1])
            skip_next = True
        else:
            merged.append(wd)
    return merged

In [None]:
# children_tales['Tale,Label'] = children_tales['Tale,Label'].apply(lambda x: HE_merge(x))

### Remove "НЕ" 

In [None]:
def HE_remove(wd_list):
    """Return a new list with every token equal to 'не' left out."""
    return [token for token in wd_list if token != 'не']

In [None]:
# children_tales['Tale,Label'] = children_tales['Tale,Label'].apply(lambda x: HE_remove(x))

### Remove word(s)

In [None]:
def wds_remover(line, wds_list):
    """Return the tokens of ``line`` that do not occur in ``wds_list``, as a new list."""
    kept = []
    for token in line:
        if token not in wds_list:
            kept.append(token)
    return kept

In [None]:
# children_tales['Tale,Label'] = children_tales['Tale,Label'].apply(lambda line: wds_remover(line, wds_list))

# Functions for preprocessing

### prepoc_1
### * reg_exp + pymorphy + stop_words + HE_remove

In [None]:
def prepoc_1(data, col, stop_words_dict):
    """Pipeline 1: regex cleaning -> pymorphy2 lemmas -> stop-word removal -> drop 'не'.

    Args:
        data: frame with the raw text in column ``col``.
        col: name of the text column; it ends up holding token lists.
        stop_words_dict: stop words handed to ``stop_words_remover``.

    Returns:
        The frame returned by the last step, with ``col`` overwritten at each stage.

    Uses the notebook-level ``morph`` analyzer.
    """
    cleaned = regular_cleaning(data, col)
    cleaned[col] = cleaned[col].apply(lemmat_pymorph, args=(morph,))
    filtered = stop_words_remover(cleaned, stop_words_dict, col)
    filtered[col] = filtered[col].apply(HE_remove)
    return filtered

### prepoc_2
### * reg_exp + mystem + stop_words + HE_remove

In [None]:
def prepoc_2(data, col, stop_words_dict):
    """Pipeline 2: regex cleaning -> Mystem lemmas -> stop-word removal -> drop 'не'.

    Args:
        data: frame with the raw text in column ``col``.
        col: name of the text column; it ends up holding token lists.
        stop_words_dict: stop words handed to ``stop_words_remover``.

    Returns:
        The frame returned by the last step, with ``col`` overwritten at each stage.

    Uses the notebook-level Mystem instance ``m``.
    """
    cleaned = regular_cleaning(data, col)
    cleaned[col] = cleaned[col].apply(lemmatize_str, args=(m,))
    filtered = stop_words_remover(cleaned, stop_words_dict, col)
    filtered[col] = filtered[col].apply(HE_remove)
    return filtered

### prepoc_3
### reg_exp + mystem + stop_words + HE_merge

In [None]:
def prepoc_3(data, col, stop_words_dict):
    """Pipeline 3: regex cleaning -> Mystem lemmas -> stop-word removal -> merge 'не' with the next word.

    Args:
        data: frame with the raw text in column ``col``.
        col: name of the text column; it ends up holding token lists.
        stop_words_dict: stop words handed to ``stop_words_remover``.

    Returns:
        The frame returned by the last step, with ``col`` overwritten at each stage.

    Uses the notebook-level Mystem instance ``m``.
    """
    cleaned = regular_cleaning(data, col)
    cleaned[col] = cleaned[col].apply(lemmatize_str, args=(m,))
    filtered = stop_words_remover(cleaned, stop_words_dict, col)
    filtered[col] = filtered[col].apply(HE_merge)
    return filtered

### prepoc_4
### reg_exp + mystem + stop_words + bigram + HE_remove

In [None]:
def prepoc_4(data, col, stop_words_dict, bgm_freq = 8, threshold = 7):
    """Pipeline 4: regex cleaning -> Mystem lemmas -> stop-word removal -> bigrams -> drop 'не'.

    Args:
        data: frame with the raw text in column ``col``.
        col: name of the text column; it ends up holding token lists.
        stop_words_dict: stop words handed to ``stop_words_remover``.
        bgm_freq: ``min_count`` argument for ``n_gram_maker``.
        threshold: ``threshold`` argument for ``n_gram_maker``.

    Returns:
        The frame returned by the last step, with ``col`` overwritten at each stage.

    Uses the notebook-level Mystem instance ``m``.
    """
    cleaned = regular_cleaning(data, col)
    cleaned[col] = cleaned[col].apply(lemmatize_str, args=(m,))
    filtered = stop_words_remover(cleaned, stop_words_dict, col)
    with_bigrams = n_gram_maker(filtered, col, bgm_freq, threshold)
    filtered[col] = with_bigrams
    filtered[col] = filtered[col].apply(HE_remove)
    return filtered

### prepoc_5
### reg_exp + mystem + stop_words + trigram + HE_remove

In [None]:
def prepoc_5(data, col, stop_words_dict, bgm_freq = 8, tgm_freq = 5, threshold = 7):
    """Pipeline 5: regex cleaning -> Mystem lemmas -> stop-word removal -> two phrase passes -> drop 'не'.

    The second ``n_gram_maker`` pass runs on the output of the first, which is
    how trigrams are formed from already merged bigrams.

    Args:
        data: frame with the raw text in column ``col``.
        col: name of the text column; it ends up holding token lists.
        stop_words_dict: stop words handed to ``stop_words_remover``.
        bgm_freq: ``min_count`` for the first phrase pass.
        tgm_freq: ``min_count`` for the second phrase pass.
        threshold: ``threshold`` used by both phrase passes.

    Returns:
        The frame returned by the last step, with ``col`` overwritten at each stage.

    Uses the notebook-level Mystem instance ``m``.
    """
    cleaned = regular_cleaning(data, col)
    cleaned[col] = cleaned[col].apply(lemmatize_str, args=(m,))
    filtered = stop_words_remover(cleaned, stop_words_dict, col)
    for min_freq in (bgm_freq, tgm_freq):
        filtered[col] = n_gram_maker(filtered, col, min_freq, threshold)
    filtered[col] = filtered[col].apply(HE_remove)
    return filtered

### prepoc_6 EXTRA
удаляем все слова кроме НЕ и:
* прилагательное (A)
* наречие (ADV)
* местоимение-прилагательное(APRO)
* часть композита - сложного слова (COM)
* существительное (S)
### reg_exp + mystem + stop_words + part_of_speech_filt + bigram + HE_remove

# Preprocess data & Best model after different preprocessing techniques

Copy the main dataset 5 times, once for each of the 5 preprocessing pipelines.

In [None]:
col = 'Tale,Label'
# One independent copy of the merged corpus per preprocessing pipeline.
df_raw_1, df_raw_2, df_raw_3, df_raw_4, df_raw_5 = (df_raw.copy() for _ in range(5))

In [None]:
# Run pipeline 1, then print the tokens of the row at position 1000 as a spot check.
df_prep_1 = prepoc_1(df_raw_1, col, stop_words_dict)
sample_tokens = df_raw_1['Tale,Label'].iloc[1000]
print(*sample_tokens)

In [None]:
# Run pipeline 2, then print the tokens of the row at position 1000 as a spot check.
df_prep_2 = prepoc_2(df_raw_2, col, stop_words_dict)
sample_tokens = df_raw_2['Tale,Label'].iloc[1000]
print(*sample_tokens)

In [None]:
# Run pipeline 3, then print the tokens of the row at position 1000 as a spot check.
df_prep_3 = prepoc_3(df_raw_3, col, stop_words_dict)
sample_tokens = df_raw_3['Tale,Label'].iloc[1000]
print(*sample_tokens)

In [None]:
# Run pipeline 4 (bigrams), then print the tokens of the row at position 1000 as a spot check.
df_prep_4 = prepoc_4(df_raw_4, col, stop_words_dict,
                     bgm_freq = 8,
                     threshold = 7)
sample_tokens = df_raw_4['Tale,Label'].iloc[1000]
print(*sample_tokens)

In [None]:
# Run pipeline 5 (two phrase passes), then print the tokens of the row at position 1000 as a spot check.
df_prep_5 = prepoc_5(df_raw_5, col, stop_words_dict,
                     bgm_freq = 8,
                     tgm_freq = 5,
                     threshold = 7)
sample_tokens = df_raw_5['Tale,Label'].iloc[1000]
print(*sample_tokens)

Run 5 default LDA models, one per preprocessing pipeline, to find the best one.

number of topics (n_topics) = 3

In [None]:
# Train one LdaMulticore model (3 topics, 10 passes, fixed random_state) per
# preprocessed corpus and print its log-perplexity and c_v coherence, so the
# five pipelines can be compared.
# Note: the loop variables (i, doc, doc_list, dct, corpus, lda_model, ...) remain
# defined at notebook scope after the loop, holding the values of the last iteration.
docs = [df_prep_1, df_prep_2, df_prep_3, df_prep_4, df_prep_5]

for i, doc in enumerate(docs):
    doc_list = list(doc[col])
    n_topics = 3
    n_words = 12  # assigned but not used inside this loop
    
    # token -> id mapping and bag-of-words corpus for this pipeline's documents
    dct = corpora.Dictionary(doc_list)
    corpus = [dct.doc2bow(line) for line in doc_list]

    lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         num_topics=n_topics,
                         passes=10,
                         random_state=2022
                        )
    
    # 1-based pipeline number, matching the prepoc_N naming
    print(i + 1, 'preproc.')
    print('Perplexity: ', lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_list, dictionary=dct, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda, '\n')

Best preproc on LdaMulticore model. Results

In [None]:
def get_LDA_res(doc, col, n_words = 12, n_topics = 3, label = None):
    """Train an LdaMulticore model on ``doc[col]`` and print its quality metrics.

    Args:
        doc: frame whose column ``col`` holds one token list per row.
        col: name of the token-list column.
        n_words: accepted for backward compatibility; not used by this function.
        n_topics: number of topics to fit (default 3, the value that used to be
            hard-coded).
        label: optional pipeline number/name; when given, a
            ``<label> preproc.`` header line is printed before the metrics.

    Returns:
        The trained ``LdaMulticore`` model.

    The header used to be built from a notebook-level variable ``i`` that is
    not a parameter of this function; that raised NameError on a fresh kernel
    and printed a stale number otherwise. It is now driven by ``label``.
    """
    doc_list = list(doc[col])

    dct = corpora.Dictionary(doc_list)
    corpus = [dct.doc2bow(line) for line in doc_list]
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dct,
                             num_topics=n_topics,
                             passes=10,
                             random_state=2022
                            )

    if label is not None:
        print(label, 'preproc.')
    print('Perplexity: ', lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_list, dictionary=dct, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda, '\n')
    return lda_model

In [None]:
doc = df_prep_1
# Build the document list from the frame selected above, so the dictionary and
# the corpus describe the same documents (doc_list was otherwise whatever an
# earlier cell left at notebook scope).
doc_list = list(doc[col])

lda_model_prep_1 = get_LDA_res(doc, col)
dct = corpora.Dictionary(doc_list)
corpus = [dct.doc2bow(line) for line in doc_list]

#### First good SCORE
default LdaMulticore with preprocessing pipeline 1
* Perplexity:  -9.151746406174441
* Coherence Score:  0.4056843395969147 

# Find Best Hyperparameters with GridSearch

Links for tuning parameters:
1. https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
2. https://stackoverflow.com/questions/65014553/how-to-tune-the-parameters-for-gensim-ldamulticore-in-python

Function for learning models with determined params

In [None]:
def compute_coherence_values(doc, col, i,n, ch_size_val, pw_topics_val, decay_val, n_words = 12, mod = False):
    """Fit a 3-topic LdaMulticore model with the given hyperparameters and report its metrics.

    Args:
        doc: frame whose column ``col`` holds one token list per row.
        col: name of the token-list column.
        i: ordinal of this run, used only in the progress line.
        n: total number of runs, used only in the progress line.
        ch_size_val: value for ``chunksize``.
        pw_topics_val: value for ``per_word_topics``.
        decay_val: value for ``decay``.
        n_words: not used by this function.
        mod: when truthy the trained model is returned instead of the score.

    Returns:
        The c_v coherence score, or the trained model if ``mod`` is truthy.
    """
    n_topics = 3
    doc_list = list(doc[col])

    dct = corpora.Dictionary(doc_list)
    corpus = [dct.doc2bow(line) for line in doc_list]

    lda_params = dict(
        corpus=corpus,
        id2word=dct,
        num_topics=n_topics,
        random_state=2022,
        passes=10,
        chunksize=ch_size_val,
        per_word_topics=pw_topics_val,
        decay=decay_val,
    )
    lda_model = LdaMulticore(**lda_params)

    print(f'{i} from {n}. ch_size_val = {ch_size_val}, pw_topics_val = {pw_topics_val}, decay_val = {decay_val}.')
    print('Perplexity: ', lda_model.log_perplexity(corpus))
    coherence_lda = CoherenceModel(model=lda_model, texts=doc_list,
                                   dictionary=dct, coherence='c_v').get_coherence()
    print('Coherence Score: ', coherence_lda, '\n')

    return lda_model if mod else coherence_lda

naive GridSearch

In [None]:
doc = df_prep_1
col = 'Tale,Label'

per_word_topics = [True, False]
chunksize = [100, 500, 1000, 2000, 5000]
decay = [0.5, 1]

# Every (chunksize, per_word_topics, decay) combination, chunksize varying slowest.
grid = [(ch, pw, dc) for ch in chunksize for pw in per_word_topics for dc in decay]

i, n = 1, len(grid)

for ch_size_val, pw_topics_val, decay_val in grid:
    _ = compute_coherence_values(doc = doc, col = col, i = i, n = n,
                                 ch_size_val = ch_size_val, pw_topics_val = pw_topics_val,
                                 decay_val = decay_val, n_words = 12)
    i += 1

best params are: ch_size_val = 100, pw_topics_val = False, decay_val = 0.5.
* Perplexity:  -9.095068910308957
* Coherence Score:  0.45228439636094936 

reproducing results

In [None]:
# Rebuild dictionary and corpus from the pipeline-1 documents explicitly, instead
# of relying on a doc_list left over from an earlier cell.
doc_list = list(df_prep_1[col])
dct = corpora.Dictionary(doc_list)
corpus = [dct.doc2bow(line) for line in doc_list]

# `lda_model_1` is not defined anywhere in the notebook; the model trained on
# df_prep_1 is stored as `lda_model_prep_1`.
# NOTE(review): confirm this is the model that was meant to be visualised here.
pyLDAvis.enable_notebook()
lda_viz = gensimvis.prepare(lda_model_prep_1, corpus, dct)
lda_viz

In [None]:
# Tokens removed from the pipeline-1 corpus before the final model is trained.
wds_list = [
    ']', '[',
    'пьер', 'андрей', 'наташа', 'ростов', 'нибыть', 'чичиков',
    'марья', 'анна', 'всякий', 'николай', 'de', 'соня',
    'иванович', 'вильям', 'ежели', 'борис', 'андреевич', 'денисов',
    'иван', 'анатоль', 'ах',
]

df_prep_1[col] = df_prep_1[col].apply(wds_remover, args=(wds_list,))

In [None]:
doc = df_prep_1
col = 'Tale,Label'
# Refresh doc_list now that words were removed from df_prep_1; the bare
# `list(doc[col])` expression that stood here computed the list and discarded it.
doc_list = list(doc[col])

# Best hyperparameters found by the grid search above.
ch_size_val = 100
pw_topics_val = False
decay_val = 0.5

lda_best_model = compute_coherence_values(doc = doc, col = col, i = 1, n = 1, 
                                         ch_size_val = ch_size_val, pw_topics_val = pw_topics_val, 
                                         decay_val = decay_val, n_words = 12, mod = True)

Plot graphics

In [None]:
pyLDAvis.enable_notebook()

# Dictionary and corpus must come from the same documents the model was trained
# on; previously `corpus` was built from a doc_list left over from another cell
# while `dct` came from `doc[col]`.
doc_list = list(doc[col])
dct = corpora.Dictionary(doc_list)
corpus = [dct.doc2bow(line) for line in doc_list]

lda_viz = gensimvis.prepare(lda_best_model, corpus, dct)
lda_viz

In [None]:
# save prep data
df_prep_1.to_csv('preproc_1_data.csv', index=False)

# save best model after prep
# (a bare `lda_model_prep_1` expression only evaluated the name; nothing was written)
lda_model_prep_1.save('lda_model_prep_1.gensim')

# save best model after GS
lda_best_model.save('lda_best_model.gensim')

#### BEST SCORE
3 from 20. ch_size_val = 100, pw_topics_val = False, decay_val = 0.5.
* Perplexity:  -9.095068910308957
* Coherence Score:  0.45228439636094936 

# Выводы по получившимся топикам:

### Группа 1: бытовые сказки
* Муж, ребенок, мама, парень, девушка, подруга, секс

### Группа 2: нейтральные сказки
* Действующие лица - князь, бог, граф, княжна, государь, генерал, офицер, солдат, графиня
* Специфичные глаголы - глядеть
* Еще характерные слова - гора, лошадь, улыбка, письмо

### Группа 3: сказки про бизнес
* Деньги, власть, компания, бизнес, группировка, клиент, покупатель
* Сюзерен, вассал

# Поиск доминирующей темы в каждом тексте

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: trained topic model; ``ldamodel[corpus]`` must yield, per
            document, a list of ``(topic_id, probability)`` pairs (or a tuple
            whose first element is that list when the model has a truthy
            ``per_word_topics`` attribute), and ``show_topic(topic_id)`` must
            return ``(word, weight)`` pairs.
        corpus: bag-of-words corpus passed to the model.
        texts: the documents, one entry per corpus row.

    Returns:
        DataFrame with columns 'Dominant_Topic', 'Perc_Contribution' (rounded
        to 4 decimals), 'Topic_Keywords' (comma-separated) and, as the last
        column named ``0``, the entries of ``texts``.

    Rows are collected in a list and turned into a frame once:
    ``DataFrame.append`` no longer exists in pandas 2.x and made the loop
    quadratic. A document with no topic assignment gets a NaN row instead of
    being skipped, so rows stay aligned with ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    unpack_topics = bool(getattr(ldamodel, 'per_word_topics', False))
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for row in ldamodel[corpus]:
        if unpack_topics:
            # with per_word_topics the document topics are the first tuple element
            row = row[0]
        if not row:
            records.append((float('nan'), float('nan'), None))
            continue
        # max() returns the first pair with the highest probability, the same
        # pair a stable descending sort would put first
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=columns)
    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
domin_df = format_topics_sentences(lda_best_model, corpus, list(df_prep_1[col]))

# domin_df is indexed 0..n-1 by document position, whereas df_prep_1 still
# carries the index labels of the original file. Assigning the Series directly
# would align on labels and attach targets to the wrong documents, so the
# values are copied by position.
domin_df['target'] = df_prep_1['target'].to_numpy()
domin_df.head()

In [None]:
def vals_for_accuracy_topics(df, topic_num):
    """Count the documents of one target class and how many share its most common dominant topic.

    Args:
        df: frame with 'target' and 'Dominant_Topic' columns.
        topic_num: target value selecting the rows to inspect.

    Returns:
        Tuple ``(total, dominant)``: the number of rows with that target, and
        the largest number of those rows assigned to any one of the dominant
        topics 0, 1 or 2. Both numbers are also printed.
    """
    dominant_df = df.loc[df['target'] == topic_num]
    total = dominant_df.shape[0]
    per_topic = [
        int((dominant_df['Dominant_Topic'] == topic_id).sum())
        for topic_id in (0, 1, 2)
    ]
    dominant_cnt = max(per_topic)
    print(f'total docs: {total}')
    print(f'docs with dominant topic: {dominant_cnt} \n')
    return total, dominant_cnt

In [None]:
# Per-target (total, dominant) counts, gathered in target order 0, 1, 2.
per_target = [vals_for_accuracy_topics(domin_df, target_id) for target_id in range(3)]

all_vals = sum(total for total, _ in per_target)
domain_vals = sum(dominant for _, dominant in per_target)

# Share of documents that fall into the most common dominant topic of their own target class.
acc = domain_vals / all_vals
print(f'Accuracy: {acc}')

### Варианты по улучшению:
1. Построить график распределения слов и удалить самые частые 
2. Удалить имена и отчества
3. Оставить только существительные, прилагательные, наречия, композитарные слова
4. Опробовать BigARTM модель