In [1]:
# want to make clean words and return a list of tokens

from spacy.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens        

In [2]:
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
out_tokens = tokenize(sent)
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']


In [3]:
# We want to lemmatize so dogs goes to dog and ran goes to run
# Lemmatizations means to get the "dictionary entry" for a word

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [4]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [5]:
import nltk
en_stop = set(nltk.corpus.stopwords.words('english'))

In [6]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [7]:
sent = 'I enjoy going to restaurants to eat hamburgers.'
print(prepare_text_for_lda(sent))

['enjoy', 'going', 'restaurant', 'hamburger']


In [8]:
# Get the data
import random
import os

text_data = []
filepath = os.path.join('..','resources', 'jonathan_mugan_tweets.txt')

with open(filepath) as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .95:
            print(tokens)
        text_data.append(tokens)

['spend', 'watching', 'finding', 'bigfoot', 'marathon', 'animal', 'planet', 'fear', 'death', 'sasquatch', 'summer']
['unsplit', 'infinitive', 'satisfy', 'grammar', 'checker', 'stephen', 'pave', 'adverb', 'delete']
['picture', 'normally', 'worth', 'thousand', 'words', 'picture', 'hotel', 'website', 'somehow', 'convey', 'information']
['quiet', 'outside', 'zipper', 'camping', 'memory', 'haiku', 'myfirstone']
['weird', 'cigarette', 'butt', 'consider', 'litter', 'people', 'throw', 'wherever']
['talking', 'monster', 'joke', 'matter']
['movie', 'always', 'scientist']
['want', 'grizzly', 'adams', 'except', 'instead', 'suburb']
['making', 'mistake', 'stage', 'life']
['paper', 'notebook', 'always', 'line', 'rule', 'system']
['google', 'really', 'need', 'paste', 'special', 'default', 'paste', 'without', 'formatting']
['could', 'fully', 'trust', 'documentary', 'filmmaker', 'expect', 'people', 'spend', 'something', 'report']
["haven't", 'twitter', 'whale', 'funny', 'appreciate', 'annoyance']
['tra

In [9]:
# create a dictionary fromthe data
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [10]:
# convert to a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in text_data]

In [11]:
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
pickle.dump(corpus, open("corpus.pkl", "wb"))
dictionary.save('dictionary.gensim')

In [12]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [13]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.012*"would" + 0.011*"robot" + 0.009*"could" + 0.007*"try"'), (1, '0.017*"funny" + 0.013*"remember" + 0.012*"memory" + 0.008*"seem"'), (2, '0.013*"movie" + 0.009*"world" + 0.007*"computer" + 0.007*"great"'), (3, '0.010*"always" + 0.009*"funny" + 0.009*"think" + 0.009*"coffee"'), (4, '0.015*"watch" + 0.014*"people" + 0.011*"would" + 0.009*"dream"')]
[(0, '0.012*"would" + 0.011*"robot" + 0.009*"could" + 0.007*"try"'), (1, '0.017*"funny" + 0.013*"remember" + 0.012*"memory" + 0.008*"seem"'), (2, '0.013*"movie" + 0.009*"world" + 0.007*"computer" + 0.007*"great"'), (3, '0.010*"always" + 0.009*"funny" + 0.009*"think" + 0.009*"coffee"'), (4, '0.015*"watch" + 0.014*"people" + 0.011*"would" + 0.009*"dream"')]
[(0, '0.012*"would" + 0.011*"robot" + 0.009*"could" + 0.007*"try"'), (1, '0.017*"funny" + 0.013*"remember" + 0.012*"memory" + 0.008*"seem"'), (2, '0.013*"movie" + 0.009*"world" + 0.007*"computer" + 0.007*"great"'), (3, '0.010*"always" + 0.009*"funny" + 0.009*"think" + 0.009*"coffee"'

In [14]:
# try a new document
# we it is mostly topic 3
new_doc = 'I watch movies.'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(8, 1), (193, 1)]
[(0, 0.066820683571410217), (1, 0.066876080239630362), (2, 0.068971485015669895), (3, 0.066667028480243237), (4, 0.73066472269304616)]


In [15]:
# try three topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.011*"funny" + 0.011*"people" + 0.010*"watch" + 0.009*"always"'), (1, '0.005*"could" + 0.005*"really" + 0.005*"going" + 0.005*"things"'), (2, '0.016*"would" + 0.008*"coffee" + 0.008*"people" + 0.007*"computer"')]
[(0, '0.011*"funny" + 0.011*"people" + 0.010*"watch" + 0.009*"always"'), (1, '0.005*"could" + 0.005*"really" + 0.005*"going" + 0.005*"things"'), (2, '0.016*"would" + 0.008*"coffee" + 0.008*"people" + 0.007*"computer"')]
[(0, '0.011*"funny" + 0.011*"people" + 0.010*"watch" + 0.009*"always"'), (1, '0.005*"could" + 0.005*"really" + 0.005*"going" + 0.005*"things"'), (2, '0.016*"would" + 0.008*"coffee" + 0.008*"people" + 0.007*"computer"')]


In [16]:
# try ten topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.028*"remember" + 0.023*"dream" + 0.016*"still" + 0.016*"memory"'), (1, '0.025*"would" + 0.021*"people" + 0.013*"wrong" + 0.010*"world"'), (2, '0.011*"hungry" + 0.010*"try" + 0.009*"people" + 0.009*"years"'), (3, '0.014*"always" + 0.013*"could" + 0.012*"notice" + 0.010*"feel"'), (4, '0.018*"watch" + 0.013*"coffee" + 0.011*"understand" + 0.011*"recently"'), (5, '0.016*"coffee" + 0.015*"article" + 0.014*"write" + 0.012*"drink"'), (6, '0.014*"child" + 0.014*"seem" + 0.011*"story" + 0.011*"people"'), (7, '0.022*"could" + 0.021*"going" + 0.018*"robot" + 0.013*"want"'), (8, '0.020*"people" + 0.012*"make" + 0.012*"first" + 0.011*"google"'), (9, '0.023*"movie" + 0.014*"machine" + 0.012*"computer" + 0.011*"start"')]
[(0, '0.028*"remember" + 0.023*"dream" + 0.016*"still" + 0.016*"memory"'), (1, '0.025*"would" + 0.021*"people" + 0.013*"wrong" + 0.010*"world"'), (2, '0.011*"hungry" + 0.010*"try" + 0.009*"people" + 0.009*"years"'), (3, '0.014*"always" + 0.013*"could" + 0.012*"notice" + 0.010