

### Импорты



In [None]:
!pip install pymorphy2

In [None]:
!pip install pyLDAvis==3.4.1

In [None]:
import pandas as pd
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
import pymorphy2
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from tqdm import tqdm_notebook
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
#import pyLDAvis.gensim
from gensim.corpora import Dictionary

import warnings
warnings.filterwarnings('ignore')
RAND = 10

nltk.download('stopwords')
nltk.download('punkt')


### Основная часть

In [5]:
df = pd.read_csv(PATH)

In [None]:
df.info()

### Обработка текста

In [8]:
morph = pymorphy2.MorphAnalyzer()

In [9]:
def process_text(data, stopwords = stopwords.words('russian')):
  text = re.sub('ё', 'е', data.lower())
  text = re.sub(r'([.,!?])', r' \1', text)
  text = re.sub(r'[^а-яА-Я\s]+', '', text)
  text = text.strip()
  text = [w for w in text.split() if w not in stopwords]
  text = [w for w in text if len(w) >= 3]
  return ' '.join(text)

def lemmatization_text(data, morph = morph):
  result = ' '.join([morph.parse(x)[0].normal_form for x in data.split()])
  result = ' '.join([x for x in result.split() if x not in STOPWORDS])
  return result

def get_result(data, morph = morph, stopwords = stopwords.words('russian')):
  if data is not np.NaN:
    result = process_text(data = data)
    result = lemmatization_text(result)
    #result = result.replace('', '')
    return result
  return ''

In [12]:

test_df = pd.DataFrame(columns = df.columns)
for elem in l:
  test_df = pd.concat([test_df, df[df['Область'] == elem].head(2900)], ignore_index=True) 

In [14]:
X_train, X_test = train_test_split(test_df['text'],
                                   test_size=0.2,
                                   random_state=42)

In [15]:
def transform_data(data: pd.Series) ->list:
  transform_list = [get_result(data=i) for i in data]
  transform_list = [x for x in transform_list if x not in [None, '']]
  return transform_list

train_list = transform_data(X_train)
test_list = transform_data(X_test)

### Токенизация

In [22]:
vector_ben = CountVectorizer(
    analyzer = 'word',
    min_df = 19,
    ngram_range = (2,3),
    stop_words = stopwords.words('russian') + STOPWORDS
)
train_vec_ben = vector_ben.fit_transform(train_list)


In [17]:
len(vector_ben.get_feature_names_out())

4635

In [18]:
vector_ben.get_feature_names_out()

array(['аварийный дом', 'аварийный жилищный', 'аварийный жилищный фонд',
       ..., 'ярцевск загорский', 'ярцевск загорский весь', 'ёлка желание'],
      dtype=object)

In [19]:
vector_ben

In [None]:
num_topics = 7
lda_model_ben = LatentDirichletAllocation(
    n_components = num_topics,
    learning_method = 'online',
    random_state = RAND,
    n_jobs = -1
    )
lda_model_ben.fit(train_vec_ben)
print(lda_model_ben)

In [None]:
print(lda_model_ben.components_)

In [None]:
tokenized_train_list = [doc.split() for doc in train_list]

In [None]:
dictionary_ben = Dictionary(tokenized_train_list)


In [None]:
panel = pyLDAvis.lda_model.prepare(
    lda_model_ben,
    train_vec_ben,
    vector_ben,
    mds = 'tsne'
)

pyLDAvis.display(panel)

In [None]:
print('Log Likelihood: ', lda_model_ben.score(train_vec_ben))

print('Perplexity: ', lda_model_ben.perplexity(train_vec_ben))

In [None]:
def get_coherence_mean(model, texts, n_top_words = 20):
  topics = model.components_
  
  texts = [[word for word in doc.split()] for doc in texts]


  dictionary = corpora.Dictionary(texts)
  corpus = [dictionary.doc2bow(text) for text in texts]

  feature_names = [dictionary[i] for i in range(len(dictionary))]

  top_words = []

  for topic in topics:
    top_words.append(
        [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
      
    )

  coherence_model = CoherenceModel(
      topics = top_words,
      texts = texts,
      dictionary = dictionary,
      coherence = 'c_v'
  )
  coherence = coherence_model.get_coherence()
  return coherence

In [None]:
get_coherence_mean(lda_model_ben, test_list)

In [None]:
def compute_coherence_values(vec_train_text,
                             test_text,
                             min_topic = 3,
                             max_topic = 19,
                             step = 3):
  coherence_values =[]
  model_list = []
  
  for num_topics in tqdm_notebook(range(min_topic, max_topic, step)):
    model = LatentDirichletAllocation(
        max_iter = 10,
        n_components = num_topics,
        random_state=RAND,
        n_jobs=-1
    )
    model.fit(vec_train_text)
    model_list.append(model)
    coherence_values.append(get_coherence_mean(model, test_text))

  return model_list, coherence_values                    

In [None]:
min_topic = 4
max_topic = 12
step = 2

model_list, coherence_values = compute_coherence_values(vec_train_text = train_vec_ben,
                                                        test_text=test_list,
                                                        min_topic=min_topic,
                                                        max_topic=max_topic,
                                                        step = step)



In [None]:
x = range(min_topic, max_topic, step)

plt.figure(figsize =(15, 7))

plt.plot(x, coherence_values)
plt.xlabel('Num_topics')
plt.ylabel('Coherence Score')

In [None]:
for n_topics, coherence in zip(x, coherence_values):
  print(f'Num topics = {n_topics}, Coherence = {coherence:.3f}')

In [None]:
def predict_topic(data, vectorizer = vector_ben, model = lda_model_ben):
  transform_text = get_result(data)
  vec_text = vectorizer.transform([transform_text])
  topic_proba = model.transform(vec_text)[0]

  if np.all(topic_proba == topic_proba[0]):
    return -9
  return np.argmax(topic_proba)

In [None]:
test_df['top_topic'] = test_df['text'].transform(predict_topic)

In [None]:
def show_topics(vectorizer, model, n_words = 20):
  feature_names = np.array(vectorizer.get_feature_names_out())
  top_words = []

  for topic_weights in model.components_:
    top_keyword_locs = (-topic_weights).argsort()[:n_words]
    top_words.append(feature_names.take(top_keyword_locs))

  return top_words

In [None]:
topic_keywords = show_topics(vectorizer = vector_ben,
                             model = lda_model_ben,
                             n_words = 15)

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    'Word' + str(i) for i in range(df_topic_keywords.shape[1])
]

df_topic_keywords.index = [
    'Topic' + str(i+1) for i in range(df_topic_keywords.shape[0])
]

df_topic_keywords

In [None]:
sns.set_style('white')
sns.countplot(x = 'top_topic', data = test_df[test_df['top_topic'] != -9])