In [1]:
import spacy
spacy.load("fr_core_news_sm")
from spacy.lang.fr import French
parser = French()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /Users/Phoenix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Phoenix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [6]:
text_data = []
with open('output.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        text_data.append(tokens)

In [9]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [13]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=80)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"hallyday" + 0.011*"paris" + 0.011*"johnny" + 0.011*"meghan" + 0.009*"markle" + 0.007*"laeticia" + 0.006*"middleton" + 0.006*"maison"')
(1, '0.027*"comment" + 0.017*"meteo" + 0.016*"résolu" + 0.016*"toujours" + 0.015*"fermé" + 0.012*"faire" + 0.012*"coupe" + 0.009*"femmes"')
(2, '0.026*"résolu" + 0.025*"fermé" + 0.009*"pourquoi" + 0.008*"après" + 0.006*"couple" + 0.006*"résultat" + 0.005*"moins" + 0.005*"quelle"')
(3, '0.024*"france" + 0.017*"macron" + 0.011*"météo" + 0.011*"prince" + 0.010*"brigitte" + 0.007*"emmanuel" + 0.006*"enfants" + 0.006*"grand"')
(4, '0.013*"monde" + 0.008*"femme" + 0.008*"harry" + 0.007*"ligne" + 0.006*"carré" + 0.005*"français" + 0.005*"cette" + 0.005*"horoscope"')


In [16]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(877, 1)]
[(0, 0.05000529), (1, 0.05000529), (2, 0.05000529), (3, 0.05000529), (4, 0.05000529), (5, 0.5499524), (6, 0.05000529), (7, 0.05000529), (8, 0.05000529), (9, 0.05000529)]


In [15]:
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=80)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.020*"avant" + 0.017*"middleton" + 0.016*"harry" + 0.013*"laura"')
(1, '0.028*"faire" + 0.019*"votre" + 0.016*"amour" + 0.013*"coloriage"')
(2, '0.061*"france" + 0.026*"météo" + 0.019*"recettes" + 0.016*"pourquoi"')
(3, '0.018*"mariage" + 0.017*"maison" + 0.015*"ligne" + 0.014*"famille"')
(4, '0.035*"paris" + 0.014*"euro" + 0.013*"nouvelle" + 0.013*"français"')
(5, '0.021*"coupe" + 0.020*"prince" + 0.013*"femme" + 0.012*"cette"')
(6, '0.082*"résolu" + 0.076*"fermé" + 0.035*"toujours" + 0.035*"après"')
(7, '0.044*"meteo" + 0.031*"monde" + 0.017*"contre" + 0.014*"enfants"')
(8, '0.039*"hallyday" + 0.032*"johnny" + 0.031*"meghan" + 0.027*"markle"')
(9, '0.036*"macron" + 0.023*"brigitte" + 0.021*"fille" + 0.016*"grand"')


In [22]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
