https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [17]:
import spacy
from spacy.lang.en import English
# Instantiate spaCy's English language class once; tokenize() calls this
# object on raw text and iterates over the tokens it returns.
parser = English()

In [18]:
def tokenize(text):
    """Split ``text`` into normalised tokens for topic modelling.

    The text is run through the module-level spaCy ``parser`` and every
    resulting token is mapped as follows:

    * whitespace-only tokens are dropped;
    * tokens whose ``like_url`` flag is set become the placeholder ``'URL'``;
    * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``;
    * any other token is replaced by its ``lower_`` form.

    Returns the list of resulting strings in document order.
    """
    def _normalise(tok):
        # The URL rule is checked before the '@' rule, so a token matching
        # both is reported as 'URL'.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [19]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``, falling back to ``word``.

    The lookup is delegated to ``wn.morphy``; when it returns ``None`` the
    input is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/philipcastiglione/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download('stopwords')
# English stopwords held as a set; prepare_text_for_lda() tests each token
# for membership in it.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/philipcastiglione/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def prepare_text_for_lda(text, min_token_len=5):
    """Turn raw ``text`` into the list of lemmas used as an LDA document.

    The text is tokenized with ``tokenize``; tokens that are too short or
    that appear in ``en_stop`` are discarded; the survivors are passed
    through ``get_lemma``. The stopword test is applied to the token as
    produced by ``tokenize``, i.e. before lemmatization.

    Parameters
    ----------
    text : str
        Raw document text (in this notebook, one line of the reviews file).
    min_token_len : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 5 matches the previously hard-coded ``len(token) > 4``.

    Returns
    -------
    list of str
        Lemmas in document order; empty when no token survives filtering.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_len and token not in en_stop
    ]

In [22]:
import random
text_data = []
# file_path = '../data/small_reviews.csv'
file_path = '../data/all_reviews.csv'
# Private, seeded generator: the sample is identical on every run and the
# global `random` state is left untouched.
sample_rng = random.Random(42)
# NOTE(review): the file is consumed line by line, not through a CSV parser,
# so each physical line (including any header row) counts as one document.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Keep roughly 1 line in 100. The draw happens first so the
        # tokenize/stopword/lemmatize pipeline only runs on lines that are
        # actually kept, instead of on every line of the file.
        if sample_rng.random() > .99:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            text_data.append(tokens)

['tutto', 'perfetto', 'grazie', 'mille']
['stay', 'daniel', 'great', 'welcome', 'need', 'apartment', 'spotless', 'clean', 'important', 'bike', 'amsterdam', 'great', 'amsterdam', 'daniel']
['daniel', 'perfect', 'arrive', 'welcome', 'explain', 'everything', 'concern', 'apartment', 'live', 'apartment', 'beautifully', 'furnited', 'clean', 'comfortable', 'advice', 'sightseeing', 'restaurant', 'enjoy', 'would', 'anytime']
['friendly', 'super', 'clean', 'decent', 'great', 'location', 'everything', 'nearby', 'public', 'transport']
['boyfriend', 'stay', 'apartment', 'night', 'arrive', '1.30pm', 'however', 'miss', 'flight', 'arrive', '3.15pm', 'message', 'morning', 'explain', 'would', 'couple', 'hours', 'miss', 'flight', 'reply', 'thanks', 'ruining', 'plan', 'could', 'understand', 'would', 'annoying', 'response', 'however', 'really', 'enjoy', 'apartment', 'bedroom', 'super', 'comfortable', 'place', 'clean', 'great', 'location', 'central', 'station', 'right', 'outside', 'need', 'include', 'touris

In [23]:
from gensim import corpora
import pickle

# Build the token <-> id vocabulary from the sampled documents, then encode
# each document as a bag-of-words vector against that vocabulary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so later cells can reload them. The `with` block
# guarantees the pickle file is flushed and closed; the bare open() used
# previously left the handle to the garbage collector.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [24]:
import gensim

In [25]:
NUM_TOPICS = 5
# LDA training is stochastic; a fixed random_state makes the topics
# reproducible across kernel restarts.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Derive the file name from NUM_TOPICS so the two cannot drift apart; for
# NUM_TOPICS = 5 this is still 'model5.gensim', which a later cell loads.
ldamodel.save('model%d.gensim' % NUM_TOPICS)
# Print each topic with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.007*"centro" + 0.007*"lugar" + 0.006*"amsterdam" + 0.005*"muito"')
(1, '0.012*"appartement" + 0.011*"centre" + 0.010*"séjour" + 0.009*"quartier"')
(2, '0.010*"alles" + 0.009*"wohnung" + 0.008*"amsterdam" + 0.008*"super"')
(3, '0.028*"really" + 0.018*"house" + 0.010*"clean" + 0.010*"center"')
(4, '0.036*"great" + 0.031*"place" + 0.029*"apartment" + 0.023*"location"')


In [None]:
# Disabled example, kept as a bare string literal so none of it executes.
# It sketches how to score an unseen document against the trained model:
# prepare_text_for_lda -> dictionary.doc2bow -> ldamodel.get_document_topics.
'''
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))
'''

In [26]:
# Same training set-up with 3 topics. LDA training is stochastic; a fixed
# random_state makes the topics reproducible across kernel restarts.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
# Print each topic with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.007*"amsterdam" + 0.007*"alles" + 0.006*"cancel" + 0.006*"reservation"')
(1, '0.017*"appartement" + 0.010*"centre" + 0.009*"séjour" + 0.008*"quartier"')
(2, '0.032*"great" + 0.028*"place" + 0.026*"apartment" + 0.020*"location"')


In [27]:
# Same training set-up with 10 topics. LDA training is stochastic; a fixed
# random_state makes the topics reproducible across kernel restarts.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
# Print each topic with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.009*"muito" + 0.008*"noisy" + 0.006*"laura" + 0.006*"очень"')
(1, '0.032*"place" + 0.030*"apartment" + 0.025*"great" + 0.022*"location"')
(2, '0.006*"accueillante" + 0.005*"knowledgeable" + 0.004*"empfehlen" + 0.004*"amsterdam"')
(3, '0.015*"molto" + 0.014*"centro" + 0.008*"appartamento" + 0.007*"suite"')
(4, '0.013*"appartment" + 0.012*"simon" + 0.009*"negative" + 0.008*"concern"')
(5, '0.031*"appartement" + 0.016*"séjour" + 0.015*"quartier" + 0.014*"avon"')
(6, '0.042*"great" + 0.030*"place" + 0.028*"apartment" + 0.024*"amsterdam"')
(7, '0.043*"house" + 0.011*"check" + 0.009*"experience" + 0.009*"treat"')
(8, '0.011*"centre" + 0.009*"passé" + 0.007*"chambre" + 0.007*"proximité"')
(9, '0.018*"alles" + 0.015*"wohnung" + 0.011*"amsterdam" + 0.011*"wieder"')


In [28]:
import pyLDAvis.gensim

# Reload the artefacts written by the earlier cells instead of relying on
# whatever happens to be left in the kernel.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# The `with` block closes the file deterministically; the bare open() used
# previously leaked the handle.
# NOTE: pickle.load can execute arbitrary code from the file. That is
# acceptable here only because corpus.pkl is written by this notebook;
# never point this at an untrusted file.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# Prepare and render the interactive pyLDAvis view of the 5-topic model.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [29]:
# Reload the 3-topic model from 'model3.gensim'.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
# Uses `corpus`, `dictionary` and the `pyLDAvis.gensim` import left in the
# kernel by the cell that visualises the 5-topic model, so that cell must
# have run first.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis from reordering
# topics — confirm against the pyLDAvis docs.
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
# Last expression of the cell: its return value becomes the cell output.
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [30]:
# Reload the 10-topic model from 'model10.gensim'.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
# Uses `corpus`, `dictionary` and the `pyLDAvis.gensim` import left in the
# kernel by the cell that visualises the 5-topic model, so that cell must
# have run first.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis from reordering
# topics — confirm against the pyLDAvis docs.
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
# Last expression of the cell: its return value becomes the cell output.
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [32]:
# Same training set-up with 40 topics. LDA training is stochastic; a fixed
# random_state makes the topics reproducible across kernel restarts.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=40,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model40.gensim')
# Print topics with their four highest-weighted words.
# NOTE(review): print_topics() appears to cap the number of topics it
# returns by default, so with 40 topics this likely lists only a subset —
# confirm, and pass num_topics explicitly if all 40 are wanted.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(18, '0.059*"molto" + 0.036*"centro" + 0.025*"accogliente" + 0.022*"tutto"')
(3, '0.046*"appartment" + 0.036*"uncomplicated" + 0.034*"очень" + 0.026*"квартира"')
(38, '0.022*"sightseeing" + 0.020*"neighbour" + 0.018*"immediate" + 0.013*"gorgeous"')
(22, '0.040*"access" + 0.038*"design" + 0.035*"patio" + 0.029*"prefer"')
(26, '0.049*"place" + 0.035*"amsterdam" + 0.032*"great" + 0.031*"recommend"')
(37, '0.034*"recommand" + 0.026*"stove" + 0.025*"quality" + 0.021*"минут"')
(39, '0.033*"tramway" + 0.031*"heavy" + 0.028*"renting" + 0.027*"suitcase"')
(31, '0.025*"apartment" + 0.018*"place" + 0.018*"small" + 0.016*"bathroom"')
(8, '0.028*"équipé" + 0.022*"situe" + 0.022*"logement" + 0.019*"salle"')
(5, '0.027*"restaurant" + 0.022*"place" + 0.021*"apartment" + 0.020*"amsterdam"')
(17, '0.059*"clean" + 0.047*"house" + 0.042*"locate" + 0.032*"everything"')
(29, '0.066*"l\'appartement" + 0.050*"confortable" + 0.043*"situé" + 0.024*"spacieux"')
(13, '0.024*"outdoor" + 0.023*"cable" + 0.022*"summ

In [33]:
# Reload the 40-topic model from 'model40.gensim'.
lda40 = gensim.models.ldamodel.LdaModel.load('model40.gensim')
# Uses `corpus`, `dictionary` and the `pyLDAvis.gensim` import left in the
# kernel by the cell that visualises the 5-topic model, so that cell must
# have run first.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis from reordering
# topics — confirm against the pyLDAvis docs.
lda_display40 = pyLDAvis.gensim.prepare(lda40, corpus, dictionary, sort_topics=False)
# Last expression of the cell: its return value becomes the cell output.
pyLDAvis.display(lda_display40)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
