# CH. 7 - TOPIC MODELS
## Activities

#### Activity 7.01

In [1]:
# not necessary
# added to suppress warnings coming from pyLDAvis

import warnings
warnings.filterwarnings('ignore')

In [2]:
import langdetect  # language detection
import matplotlib.pyplot  # plotting
import nltk  # natural language processing
import numpy  # arrays and matrices
import pandas  # dataframes
import pyLDAvis  # plotting
import pyLDAvis.sklearn  # plotting
import regex  # regular expressions
import sklearn  # machine learning

  from collections import Mapping


In [3]:
# define path

path = '~/Documents/packt-data/topic-model-health-tweets/latimeshealth.txt'

In [4]:
# load data

df = pandas.read_csv(path, sep="|", header=None)
df.columns = ["id", "datetime", "tweettext"]

In [5]:
# define quick look function for data frame

def dataframe_quick_look(df, nrows):
    print("SHAPE:\n{shape}\n".format(shape=df.shape))
    print("COLUMN NAMES:\n{names}\n".format(names=df.columns))
    print("HEAD:\n{head}\n".format(head=df.head(nrows)))

In [6]:
dataframe_quick_look(df, nrows=2)

SHAPE:
(4171, 3)

COLUMN NAMES:
Index(['id', 'datetime', 'tweettext'], dtype='object')

HEAD:
                   id                        datetime  \
0  576760256031682561  Sat Mar 14 15:02:15 +0000 2015   
1  576715414811471872  Sat Mar 14 12:04:04 +0000 2015   

                                           tweettext  
0  Five new running shoes that aim to go the extr...  
1  Gym Rat: Disq class at Crunch is intense worko...  



In [7]:
# view final data that will be carried forward

raw = df['tweettext'].tolist()
print("HEADLINES:\n{lines}\n".format(lines=raw[:5]))
print("LENGTH:\n{length}\n".format(length=len(raw)))

HEADLINES:
['Five new running shoes that aim to go the extra mile http://lat.ms/1ELp3wU', 'Gym Rat: Disq class at Crunch is intense workout on pulley system http://lat.ms/1EKOFdr', 'Noshing through thousands of ideas at Natural Products Expo West http://lat.ms/1EHqywg', 'Natural Products Expo also explores beauty, supplements and more http://lat.ms/1EHqyfE', 'Free Fitness Weekends in South Bay beach cities aim to spark activity http://lat.ms/1EH3SMC']

LENGTH:
4171



In [8]:
# define function for checking language of tweets
# filter to english only

def do_language_identifying(txt):
    try:
       the_language = langdetect.detect(txt)
    except:
       the_language = 'none'
    return the_language

In [9]:
# define function to perform lemmatization

def do_lemmatizing(wrd):
    out = nltk.corpus.wordnet.morphy(wrd)
    return (wrd if out is None else out)

In [10]:
# define function to cleaning tweet data

def do_tweet_cleaning(txt):
    # identify language of tweet
    # return null if language not english
    lg = do_language_identifying(txt)
    if lg != 'en':
        return None
    
    # split the string on whitespace
    out = txt.split(' ')
    
    # identify screen names
    # replace with SCREENNAME
    out = ['SCREENNAME' if i.startswith('@') else i for i in out]
    
    # identify urls
    # replace with URL
    out = [
        'URL' if bool(regex.search('http[s]?://', i)) 
        else i for i in out
    ]
    
    # remove all punctuation
    out = [regex.sub('[^\\w\\s]|\n', '', i) for i in out]
    
    # make all non-keywords lowercase
    keys = ['SCREENNAME', 'URL']
    out = [i.lower() if i not in keys else i for i in out]
    
    # remove keywords
    out = [i for i in out if i not in keys]
    
    # remove stopwords
    list_stop_words = nltk.corpus.stopwords.words('english')
    list_stop_words = [regex.sub('[^\\w\\s]', '', i) for i in list_stop_words]
    
    out = [i for i in out if i not in list_stop_words]
    
    # lemmatizing
    out = [do_lemmatizing(i) for i in out]
    
    # keep words 4 or more characters long
    out = [i for i in out if len(i) >= 5]
    
    return out

In [11]:
# apply cleaning function to every tweet

clean = list(map(do_tweet_cleaning, raw))

In [12]:
# remove none types

clean = list(filter(None.__ne__, clean))
print("HEADLINES:\n{lines}\n".format(lines=clean[:5]))
print("LENGTH:\n{length}\n".format(length=len(clean)))

HEADLINES:
[['running', 'shoes', 'extra'], ['class', 'crunch', 'intense', 'workout', 'pulley', 'system'], ['thousand', 'natural', 'product'], ['natural', 'product', 'explore', 'beauty', 'supplement'], ['fitness', 'weekend', 'south', 'beach', 'spark', 'activity']]

LENGTH:
4093



In [13]:
# turn tokens back into strings
# concatenate using whitespaces

clean_sentences = [" ".join(i) for i in clean]

In [14]:
print(clean_sentences[0:10])



In [None]:
# Activity 7.01 Unit Test

def unittest_activity_7_01(predicted):
    # testing presence of df
    try:
        df
    except NameError:
        print("No dataframe present.")
    
    # testing expected length of clean sentences
    actualCnt = 4093
    predictedCnt = len(predicted)
    assert actualCnt == predictedCnt, "List lengths not equal."

unittest_activity_7_01(predicted=clean_sentences)

#### Activity 7.02

In [16]:
# define global variables

number_words = 10
number_docs = 10
number_features = 1000

In [17]:
# bag of words conversion
# count vectorizer (raw counts)

vectorizer1 = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    max_df=0.95, 
    min_df=10, 
    max_features=number_features
)
clean_vec1 = vectorizer1.fit_transform(clean_sentences)
print(clean_vec1[0])

feature_names_vec1 = vectorizer1.get_feature_names()

  (0, 320)	1


In [18]:
# define function to calculate perplexity based on number of topics

def perplexity_by_ntopic(data, ntopics):
    output_dict = {
        "Number Of Topics": [], 
        "Perplexity Score": []
    }
    
    for t in ntopics:
        lda = sklearn.decomposition.LatentDirichletAllocation(
            n_components=t,
            learning_method="online",
            random_state=0
        )
        lda.fit(data)
        
        output_dict["Number Of Topics"].append(t)
        output_dict["Perplexity Score"].append(lda.perplexity(data))
        
    output_df = pandas.DataFrame(output_dict)
    
    index_min_perplexity = output_df["Perplexity Score"].idxmin()
    output_num_topics = output_df.loc[
        index_min_perplexity,  # index
        "Number Of Topics"  # column
    ]
        
    return (output_df, output_num_topics)

In [19]:
# execute function on vector of numbers of topics
# takes several minutes

df_perplexity, optimal_num_topics = perplexity_by_ntopic(
    clean_vec1, 
    ntopics=[i for i in range(1, 21) if i % 2 == 0]
)

In [20]:
print(df_perplexity)

   Number Of Topics  Perplexity Score
0                 2        350.450274
1                 4        400.851077
2                 6        426.428279
3                 8        462.129327
4                10        473.725037
5                12        480.092033
6                14        493.971335
7                16        503.821238
8                18        518.832303
9                20        523.589597


In [21]:
# define and fit lda model

lda = sklearn.decomposition.LatentDirichletAllocation(
    n_components=optimal_num_topics,
    learning_method="online",
    random_state=0
)
lda.fit(clean_vec1)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=2, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [22]:
# define function to format raw output into nice tables

def get_topics(mod, vec, names, docs, ndocs, nwords):
    # word to topic matrix
    W = mod.components_
    W_norm = W / W.sum(axis=1)[:, numpy.newaxis]
    # topic to document matrix
    H = mod.transform(vec)
    
    W_dict = {}
    H_dict = {}
    
    for tpc_idx, tpc_val in enumerate(W_norm):
        topic = "Topic{}".format(tpc_idx)
        
        # formatting w
        W_indices = tpc_val.argsort()[::-1][:nwords]
        W_names_values = [
            (round(tpc_val[j], 4), names[j]) 
            for j in W_indices
        ]
        W_dict[topic] = W_names_values
        
        # formatting h
        H_indices = H[:, tpc_idx].argsort()[::-1][:ndocs]
        H_names_values = [
            (round(H[:, tpc_idx][j], 4), docs[j]) 
            for j in H_indices
        ]
        H_dict[topic] = H_names_values
        
    W_df = pandas.DataFrame(
        W_dict, 
        index=["Word" + str(i) for i in range(nwords)]
    )
    H_df = pandas.DataFrame(
        H_dict,
        index=["Doc" + str(i) for i in range(ndocs)]
    )
        
    return (W_df, H_df)

In [23]:
# get nice tables

W_df, H_df = get_topics(
    mod=lda,
    vec=clean_vec1,
    names=feature_names_vec1,
    docs=raw,
    ndocs=number_docs, 
    nwords=number_words
)

In [24]:
# word-topic table

print(W_df)

                     Topic0                Topic1
Word0      (0.0405, latfit)         (0.05, study)
Word1        (0.034, study)      (0.0317, cancer)
Word2      (0.0325, health)     (0.0226, patient)
Word3      (0.0233, people)       (0.0179, death)
Word4       (0.0196, could)       (0.0172, heart)
Word5       (0.0186, brain)     (0.0154, disease)
Word6  (0.0175, researcher)   (0.015, healthcare)
Word7       (0.0173, woman)      (0.0148, weight)
Word8   (0.0143, scientist)  (0.0147, california)
Word9    (0.0133, american)     (0.0128, medical)


In [25]:
# document-topic table

print(H_df)

                                                 Topic0  \
Doc0  (0.9442, Want your legs to look good in those ...   
Doc1  (0.9373, RT @aminawrite: This little boy was b...   
Doc2  (0.9373, Are humans wired to lie? In some situ...   
Doc3  (0.9336, We're all in the clean-plate club, re...   
Doc4  (0.9334, Spend time with dad this Father’s Day...   
Doc5  (0.9315, RT @latimesscience: Bigger testicles ...   
Doc6  (0.9304, Alcohol in movies linked to binge dri...   
Doc7  (0.9284, Who is most sleep-deprived in America...   
Doc8  (0.9284, RT @lisagirion: CA Med Board, under f...   
Doc9  (0.9284, Colorectal cancers are falling among ...   

                                                 Topic1  
Doc0  (0.9497, Study of oil spill effects on tuna in...  
Doc1  (0.9443, Have you suspected that stress makes ...  
Doc2  (0.9412, Doctors often delay vaccines for youn...  
Doc3  (0.9402, Does your dog know how you're feeling...  
Doc4  (0.9354, Global warming may revive all kinds o...  
Do

In [26]:
# iteractive plot
# pca biplot and histogram

lda_plot = pyLDAvis.sklearn.prepare(lda, clean_vec1, vectorizer1, R=10)
pyLDAvis.display(lda_plot)

In [None]:
# Activity 7.02 Unit Test

def unittest_activity_7_02(num_topics):
    # testing optimal number of topics
    assert num_topics == 2, "Number of optimal topics wrong."
    
    # testing presence of lda model
    try:
        lda
    except:
        print("No lda model defined.")
        
    # testing presence of W_df
    try:
        W_df
    except:
        print("No W_df defined.")
        
    # testing presence of H_df
    try:
        H_df
    except:
        print("No H_df defined.")

unittest_activity_7_02(num_topics=optimal_num_topics)

#### Activity 7.03

In [28]:
# bag of words conversion
# tf-idf method

vectorizer2 = sklearn.feature_extraction.text.TfidfVectorizer(
    analyzer="word",
    max_df=0.5, 
    min_df=20, 
    max_features=number_features,
    smooth_idf=False
)
clean_vec2 = vectorizer2.fit_transform(clean_sentences)
print(clean_vec2[0])

feature_names_vec2 = vectorizer2.get_feature_names()




In [29]:
# define and fit nmf model

nmf = sklearn.decomposition.NMF(
    n_components=optimal_num_topics,
    init="nndsvda",
    solver="mu",
    beta_loss="frobenius",
    random_state=0, 
    alpha=0.1, 
    l1_ratio=0.5
)
nmf.fit(clean_vec2)

NMF(alpha=0.1, beta_loss='frobenius', init='nndsvda', l1_ratio=0.5,
    max_iter=200, n_components=2, random_state=0, shuffle=False, solver='mu',
    tol=0.0001, verbose=0)

In [30]:
# get nicely formatted result tables

W_df, H_df = get_topics(
    mod=nmf,
    vec=clean_vec2,
    names=feature_names_vec2,
    docs=raw,
    ndocs=number_docs, 
    nwords=number_words
)

In [31]:
# word-topic table

print(W_df)

                  Topic0                Topic1
Word0    (0.3724, study)      (0.5947, latfit)
Word1   (0.0259, cancer)       (0.0483, steps)
Word2   (0.0208, people)       (0.0444, today)
Word3   (0.0185, health)      (0.04, exercise)
Word4    (0.0184, brain)  (0.0272, healthtips)
Word5  (0.0184, obesity)     (0.0257, workout)
Word6  (0.0175, suggest)      (0.022, fitness)
Word7   (0.0167, weight)     (0.0202, getting)
Word8    (0.0159, woman)       (0.0142, great)
Word9     (0.014, could)     (0.0131, morning)


In [32]:
# document-topic table

print(H_df)

                                                 Topic0  \
Doc0  (0.2028, Knot Yet: Getting married later can h...   
Doc1  (0.2028, RT @latimesscience: Estrogen helps fe...   
Doc2  (0.2028, Resveratrol's anti-aging potential ge...   
Doc3  (0.2028, Self-injury: Even little boys and gir...   
Doc4  (0.2028, Study: Annual PSA screening doesn't r...   
Doc5  (0.2028, Survey by @UCSF researchers finds no ...   
Doc6  (0.2028, Cardio doesn't have to be dull — try ...   
Doc7  (0.2028, Ruling out race in college admissions...   
Doc8  (0.2028, USADA chief previews Lance Armstrong ...   
Doc9  (0.2028, The sanitary benefits of fist-bumping...   

                                                 Topic1  
Doc0  (0.2272, RT @annagorman: One more from @AmerMe...  
Doc1  (0.2272, Doctors find a simple way to manage h...  
Doc2  (0.2272, RT @LATerynbrown: ER survey: Nearly 1...  
Doc3  (0.2272, Volunteers wanted for cancer research...  
Doc4  (0.2272, RT @annagorman: Hospital's ban on abo...  
Do

In [None]:
# Activity 7.03 Unit Test

def unittest_activity_7_03():
    # testing presence of lda model
    try:
        nmf
    except:
        print("No lda model defined.")
        
    # testing presence of W_df
    try:
        W_df
    except:
        print("No W_df defined.")
        
    # testing presence of H_df
    try:
        H_df
    except:
        print("No H_df defined.")

unittest_activity_7_03()