# Тематическое моделирование

In [3]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
from tqdm import tqdm

In [4]:
# Mount Google Drive at /content/drive so the data files used below are reachable.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Mounted at /content/drive


In [7]:
# Load the pre-tokenized reviews. Only the two columns used later in the notebook
# are read, which skips the stray 'Unnamed: 0' column (presumably an index that was
# written out by an earlier export) and keeps the frame smaller.
df = pd.read_csv(
    '/content/drive/MyDrive/NLP/Handin/Data/df_tokenized.csv',
    usecols=['tokens', 'sentiment'],
)

  and should_run_async(code)


In [8]:
# Preview the first rows to check what was loaded.
df.head()

  and should_run_async(code)


Unnamed: 0,tokens,sentiment
0,"['everyth', 'perfect', 'nice', 'clean', 'every...",positive
1,"['appart', 'beautiful', 'veri', 'friendli', 'h...",positive
2,"['spent', 'excel', 'night', 'thi', 'apart', 'g...",positive
3,"['great', 'host', 'super', 'respons', 'make', ...",positive
4,"['properti', 'great', 'locat', 'base', 'look',...",positive


В следующих шагах оставляем только существительные, прилагательные, глаголы и наречия

In [9]:
# Each cell of the 'tokens' column holds the string form of a Python list;
# parse every cell back into a real list, giving one token list per review.
data = [ast.literal_eval(raw_tokens) for raw_tokens in df['tokens']]

  and should_run_async(code)


In [10]:
# Load the small English spaCy pipeline; it is used below to read each token's part of speech.
nlp = spacy.load('en_core_web_sm')

  and should_run_async(code)


In [11]:
# Parts of speech to keep: nouns, adjectives, verbs and adverbs.
CONTENT_POS = {'NOUN', 'ADJ', 'VERB', 'ADV'}

# Re-join each review's tokens into one whitespace-separated string for spaCy.
# A generator keeps this lazy so the joined texts are never all held in memory.
review_texts = (' '.join(tokens) for tokens in data)

# nlp.pipe processes the reviews in batches instead of one nlp() call per review.
# NER and the lemmatizer are skipped because only `pos_` is read here.
# NOTE(review): the parser is deliberately left enabled; spaCy's English pipelines
# can use the dependency parse when mapping tags to POS, so dropping it could
# change which tokens survive the filter. Confirm before disabling it as well.
parsed_reviews = nlp.pipe(review_texts, disable=['ner', 'lemmatizer'], batch_size=1000)

final_data = []
for review in tqdm(parsed_reviews, total=len(data)):
    # Keep plain strings, not spaCy Token objects: a Token references its parent
    # Doc, so storing Tokens would keep every parsed review alive in memory.
    final_data.append([word.text for word in review if word.pos_ in CONTENT_POS])

  and should_run_async(code)
100%|██████████| 239031/239031 [50:57<00:00, 78.18it/s]


In [12]:
# Convert every kept token to a plain string, one list per review.
data = [list(map(str, review_tokens)) for review_tokens in tqdm(final_data)]

  and should_run_async(code)
100%|██████████| 239031/239031 [00:05<00:00, 42122.91it/s]


In [13]:
# Rebuild the frame with the POS-filtered token lists next to the original sentiment labels.
df = pd.DataFrame(dict(tokens=data, sentiment=df['sentiment']))

  and should_run_async(code)


In [14]:
# Persist the POS-filtered tokens and sentiment labels; index=False keeps the row index out of the file.
df.to_csv('/content/drive/MyDrive/NLP/Handin/Data/topic_modeling_final.csv', index=False)

  and should_run_async(code)


In [16]:
# Disabled on purpose. These lines are only needed when `df` has been re-read from
# topic_modeling_final.csv (presumably the token lists come back as strings there).
# In this session `data` is already a plain Python list of token lists, so calling
# .tolist() on it fails.
# data = df['tokens'].apply(ast.literal_eval)
# data = data.tolist()

  and should_run_async(code)


AttributeError: 'list' object has no attribute 'tolist'

In [17]:
# Inspect the token lists of the first two reviews.
data[:2]

  and should_run_async(code)


[['everyth',
  'perfect',
  'nice',
  'clean',
  'everyth_perfect',
  'everyth_perfect_nice'],
 ['appart',
  'beautiful',
  'veri',
  'help',
  'want',
  'friendli_help',
  'help_us',
  'us_want',
  'veri_friendli_help',
  'friendli_help_us']]

### Предварительная обработка данных

##### Создаем словарь со всеми уникальными словами

In [18]:
# Build the gensim Dictionary (token <-> integer id mapping) from all review token lists.
id2word = corpora.Dictionary(data)

  and should_run_async(code)


In [19]:
# Encode every review as a bag of words: a list of (token_id, count) pairs.
corpus = [id2word.doc2bow(review_tokens) for review_tokens in data]

  and should_run_async(code)


In [20]:
# Show the (token_id, count) pairs of the first review.
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]


  and should_run_async(code)


In [21]:
# Number of reviews in the corpus before empty documents are dropped.
len(corpus)

  and should_run_async(code)


239031

In [22]:
# Drop reviews whose bag of words is empty (filter(None, ...) keeps only truthy items).
corpus = list(filter(None, corpus))

  and should_run_async(code)


In [23]:
# Number of reviews left after the empty ones were removed.
len(corpus)

  and should_run_async(code)


238530

##### Фильтр для экстремальных значений

Токены, которые появляются менее чем в 30 отзывах или более чем в 80% отзывов, будут отфильтрованы.
Далее сохраняем 10 000 наиболее частых токенов.

In [24]:
# Prune the vocabulary in place: no_below=30 and no_above=0.8 bound the document
# frequency of kept tokens, and keep_n=10000 caps the vocabulary size.
# NOTE(review): gensim appears to reassign token ids after pruning, so the `corpus`
# built before this call likely no longer matches `id2word`. Confirm, and use a
# bag of words rebuilt after this step for modelling.
id2word.filter_extremes(no_below=30, no_above=0.8, keep_n=10000)

  and should_run_async(code)


In [25]:
# Vocabulary size after pruning. len() on the Dictionary gives the same number
# without going through the Python-2-style iteritems() compatibility method.
print(len(id2word))

10000


  and should_run_async(code)


##### Bag of words

In [26]:
# Re-encode every review as (token_id, count) pairs using the pruned dictionary.
bow_corpus = list(map(id2word.doc2bow, data))

  and should_run_async(code)


In [27]:
# Show the first review's bag of words under the pruned dictionary.
print(bow_corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


  and should_run_async(code)


In [28]:
# Number of documents in bow_corpus (nothing is filtered out of it in the cell that builds it).
print(len(bow_corpus))

239031


  and should_run_async(code)


In [29]:
# Print each token of the first review with its id, its text and its count.
first_review = bow_corpus[0]
for token_id, count in first_review:
    print('Word {} (\"{}\") appears {} time.'.format(token_id, id2word[token_id], count))

Word 0 ("clean") appears 1 time.
Word 1 ("everyth") appears 1 time.
Word 2 ("everyth_perfect") appears 1 time.
Word 3 ("nice") appears 1 time.
Word 4 ("perfect") appears 1 time.


  and should_run_async(code)


### Применяем TF-IDF

In [30]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to the same corpus
# to obtain TF-IDF-weighted documents for the first LDA run.
tfidf_model = TfidfModel(bow_corpus)
corpus_tfidf = tfidf_model[bow_corpus]

  and should_run_async(code)


### Запуск LDA с TF-IDF

In [31]:
# Train a 6-topic LDA model on the TF-IDF-weighted corpus.
# random_state=42 fixes the seed so the run can be repeated; passes=5 and
# alpha='auto' are gensim training settings (see the LdaModel documentation).
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                            id2word=id2word,
                                            num_topics=6,
                                            random_state=42,
                                            passes=5,
                                            alpha='auto')

  and should_run_async(code)


### Визуализация тем

In [32]:
# Enable inline rendering and prepare the interactive topic view (R=15 terms, 'mmds' layout).
# NOTE(review): the model in `lda_model` was trained on `corpus_tfidf`, but the raw
# `bow_corpus` is passed here. Confirm this is intended.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word, mds='mmds', R=15)

  and should_run_async(code)


In [33]:
# Render the prepared pyLDAvis view as the cell output.
vis

  and should_run_async(code)


In [34]:
# List every topic's top terms (-1 requests all topics), numbering topics from 1.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id + 1, topic_terms))

Topic: 1 
Words: 0.010*"locat_clean" + 0.009*"veri_nice_apart" + 0.008*"nice_room" + 0.007*"amaz_apart" + 0.007*"check_process" + 0.006*"host_nice" + 0.006*"room_nice" + 0.005*"requir" + 0.005*"veri_flexibl" + 0.005*"pillow"
Topic: 2 
Words: 0.036*"great_place" + 0.032*"place_stay" + 0.021*"place_great" + 0.013*"nice_place" + 0.012*"bakeri" + 0.011*"spaciou_apart" + 0.010*"great_place_stay" + 0.010*"stay_great" + 0.008*"veri_nice_place" + 0.008*"apart_love"
Topic: 3 
Words: 0.008*"close" + 0.006*"central" + 0.005*"friendli_host" + 0.005*"definit" + 0.005*"definit_recommend" + 0.004*"flat" + 0.004*"heart" + 0.004*"everyth_perfect" + 0.004*"nice" + 0.004*"metro"
Topic: 4 
Words: 0.010*"great" + 0.009*"apart" + 0.008*"place" + 0.007*"stay" + 0.007*"nice" + 0.006*"host" + 0.005*"recommend" + 0.005*"good" + 0.005*"perfect" + 0.005*"clean"
Topic: 5 
Words: 0.007*"groceri_store" + 0.006*"son" + 0.006*"great_area" + 0.006*"like_stay" + 0.006*"veri_good" + 0.005*"wash_machin" + 0.005*"beauti_vi

  and should_run_async(code)


### Запуск LDA без TF-IDF

In [35]:
# Train a second 6-topic LDA model, this time on raw bag-of-words counts, with the
# same settings as the TF-IDF run so the two can be compared.
# This rebinds `lda_model`, so the TF-IDF-trained model is no longer reachable afterwards.
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                            id2word=id2word,
                                            num_topics=6,
                                            random_state=42,
                                            passes=5,
                                            alpha='auto')

  and should_run_async(code)


### Визуализация тем

In [36]:
# Prepare the interactive topic view for the bag-of-words LDA model (R=15 terms, 'mmds' layout).
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word, mds='mmds', R=15)

  and should_run_async(code)


In [37]:
# Render the prepared pyLDAvis view as the cell output.
vis

  and should_run_async(code)


In [38]:
# List every topic's top terms (-1 requests all topics), numbering topics from 1.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id + 1, topic_terms))

Topic: 1 
Words: 0.089*"stay" + 0.045*"host" + 0.042*"recommend" + 0.025*"apart" + 0.024*"love" + 0.024*"perfect" + 0.021*"help" + 0.017*"wonder" + 0.017*"veri" + 0.016*"hous"
Topic: 2 
Words: 0.216*"great" + 0.162*"place" + 0.026*"back" + 0.022*"come" + 0.021*"place_stay" + 0.019*"thi_place" + 0.016*"great_place" + 0.015*"great_stay" + 0.015*"great_locat" + 0.015*"come_back"
Topic: 3 
Words: 0.059*"nice" + 0.048*"apart" + 0.040*"good" + 0.035*"close" + 0.026*"easi" + 0.022*"clean" + 0.021*"quiet" + 0.021*"flat" + 0.016*"veri_nice" + 0.014*"veri"
Topic: 4 
Words: 0.035*"walk" + 0.022*"restaur" + 0.019*"station" + 0.016*"lot" + 0.015*"space" + 0.015*"room" + 0.014*"area" + 0.014*"train" + 0.014*"apart" + 0.011*"also"
Topic: 5 
Words: 0.018*"make" + 0.018*"thi" + 0.014*"even" + 0.013*"day" + 0.012*"time" + 0.011*"get" + 0.010*"night" + 0.010*"room" + 0.009*"feel" + 0.009*"best"
Topic: 6 
Words: 0.034*"give" + 0.026*"question" + 0.025*"trip" + 0.021*"quick" + 0.020*"alway" + 0.018*"ani" +

  and should_run_async(code)
