In [1]:
# want to make clean words and return a list of tokens
import spacy
parser = spacy.load("en_core_web_sm")

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens        

In [2]:
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
out_tokens = tokenize(sent)
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']


In [3]:
import nltk

# Download the 'wordnet' package if you don't have it
# nltk.download('wordnet')

In [4]:
# We want to lemmatize so dogs goes to dog and ran goes to run
# Lemmatizations means to get the "dictionary entry" for a word

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [5]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [6]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [7]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [8]:
sent = 'I enjoy going to restaurants to eat hamburgers.'
print(prepare_text_for_lda(sent))

['enjoy', 'going', 'restaurant', 'hamburger']


In [9]:
# Get the data
import random
import os

text_data = []
filepath = 'jonathan_mugan_tweets.txt'

with open(filepath) as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .95:
            print(tokens)
        text_data.append(tokens)

['finger', 'try', 'package', 'battery', 'amaze', 'packaging', 'madness', 'trigger', 'lawsuit']
['picture', 'normally', 'worth', 'thousand', 'words', 'picture', 'hotel', 'website', 'somehow', 'convey', 'information']
['looking', 'refrigerator', 'harvest', 'world']
['recently', 'watch', 'safety', 'guarantee', 'seem', 'overrate', 'would']
['think', 'another', 'century', 'actually', 'guess']
['watch', 'place', 'beyond', 'pine', 'gosling', 'open', 'movie', 'carnival', 'remind', 'notebook', 'end', 'differently', 'though']
['follow', 'someone', 'would', 'showing', 'follow']
['would', 'useful', 'could', 'iphone', 'kindly']
['drove', 'water', 'softener', 'front', 'hassle', 'carsarestupid']
['crazy', 'every', 'video', 'application', 'feel', 'compel', 'permission', 'screen', 'fullscreen', 'button']
['smoke', 'alarm', 'hotel', 'chirp', 'could', 'impervious']
['introduce', 'graph', 'theory', 'explain', 'node', 'people', 'links', 'friendship']
['email', 'someone', 'official', 'answer', 'person', 'an

In [10]:
# create a dictionary fromthe data
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [11]:
# convert to a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in text_data]

In [12]:
# Create an output/ directory
import os
os.makedirs('output', exist_ok=True)

In [13]:
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
pickle.dump(corpus, open(os.path.join('output', 'corpus.pkl'), 'wb'))
dictionary.save(os.path.join('output', 'dictionary.gensim'))

In [14]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word=dictionary, passes=15)
ldamodel.save(os.path.join('output', 'model5.gensim'))

In [15]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.011*"people" + 0.011*"every" + 0.010*"would" + 0.008*"thing"'), (1, '0.016*"always" + 0.010*"think" + 0.008*"coffee" + 0.007*"could"'), (2, '0.012*"would" + 0.009*"realize" + 0.008*"great" + 0.008*"writing"'), (3, '0.018*"people" + 0.013*"movie" + 0.011*"things" + 0.010*"could"'), (4, '0.009*"funny" + 0.009*"going" + 0.008*"child" + 0.007*"memory"')]
[(0, '0.011*"people" + 0.011*"every" + 0.010*"would" + 0.008*"thing"'), (1, '0.016*"always" + 0.010*"think" + 0.008*"coffee" + 0.007*"could"'), (2, '0.012*"would" + 0.009*"realize" + 0.008*"great" + 0.008*"writing"'), (3, '0.018*"people" + 0.013*"movie" + 0.011*"things" + 0.010*"could"'), (4, '0.009*"funny" + 0.009*"going" + 0.008*"child" + 0.007*"memory"')]
[(0, '0.011*"people" + 0.011*"every" + 0.010*"would" + 0.008*"thing"'), (1, '0.016*"always" + 0.010*"think" + 0.008*"coffee" + 0.007*"could"'), (2, '0.012*"would" + 0.009*"realize" + 0.008*"great" + 0.008*"writing"'), (3, '0.018*"people" + 0.013*"movie" + 0.011*"things" + 0.010

In [16]:
# try a new document
# we it is mostly topic 3
new_doc = 'I watch movies.'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(8, 1), (191, 1)]
[(0, 0.06784614), (1, 0.06666765), (2, 0.06738649), (3, 0.7314321), (4, 0.06666769)]


In [17]:
# try three topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word=dictionary, passes=15)
ldamodel.save(os.path.join('output', 'model3.gensim'))
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.012*"people" + 0.011*"movie" + 0.011*"watch" + 0.010*"would"'), (1, '0.008*"always" + 0.007*"amaze" + 0.007*"reading" + 0.007*"funny"'), (2, '0.010*"funny" + 0.010*"going" + 0.008*"remember" + 0.006*"memory"')]
[(0, '0.012*"people" + 0.011*"movie" + 0.011*"watch" + 0.010*"would"'), (1, '0.008*"always" + 0.007*"amaze" + 0.007*"reading" + 0.007*"funny"'), (2, '0.010*"funny" + 0.010*"going" + 0.008*"remember" + 0.006*"memory"')]
[(0, '0.012*"people" + 0.011*"movie" + 0.011*"watch" + 0.010*"would"'), (1, '0.008*"always" + 0.007*"amaze" + 0.007*"reading" + 0.007*"funny"'), (2, '0.010*"funny" + 0.010*"going" + 0.008*"remember" + 0.006*"memory"')]


In [18]:
# try ten topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word=dictionary, passes=15)
ldamodel.save(os.path.join('output', 'model10.gensim'))
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topics)

[(0, '0.019*"would" + 0.016*"funny" + 0.015*"people" + 0.013*"problem"'), (1, '0.013*"today" + 0.011*"funny" + 0.011*"starting" + 0.009*"article"'), (2, '0.012*"remind" + 0.011*"expect" + 0.010*"check" + 0.008*"appear"'), (3, '0.017*"always" + 0.015*"thought" + 0.013*"movie" + 0.011*"computer"'), (4, '0.014*"going" + 0.012*"people" + 0.012*"reading" + 0.012*"movie"'), (5, '0.012*"people" + 0.010*"think" + 0.010*"come" + 0.009*"world"'), (6, '0.011*"store" + 0.010*"terrible" + 0.010*"point" + 0.008*"sugar"'), (7, '0.021*"remember" + 0.018*"funny" + 0.014*"memory" + 0.013*"pretty"'), (8, '0.028*"would" + 0.015*"watch" + 0.012*"recently" + 0.011*"crazy"'), (9, '0.032*"dream" + 0.014*"child" + 0.013*"coffee" + 0.011*"drink"')]
[(0, '0.019*"would" + 0.016*"funny" + 0.015*"people" + 0.013*"problem"'), (1, '0.013*"today" + 0.011*"funny" + 0.011*"starting" + 0.009*"article"'), (2, '0.012*"remind" + 0.011*"expect" + 0.010*"check" + 0.008*"appear"'), (3, '0.017*"always" + 0.015*"thought" + 0.013

In [19]:
# Exercise: Run LDA on Newsgroup Data
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train')
print(dir(texts))
# 11,314 posts
print(len(texts.target))
print(texts.target)
print(texts.target_names)
print(texts.data[0])

['DESCR', 'data', 'filenames', 'target', 'target_names']
11314
[7 4 4 ... 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, 