In [None]:
import graphlab as gl
import numpy as np

In [2]:
msf = gl.load_sframe('../data/kindle_data.sf/')

This non-commercial license of GraphLab Create for academic use is assigned to gsimmons17@gsb.columbia.edu and will expire on December 07, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1493066255.log


In [3]:
msf.column_names()

['asin',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime',
 'upvotes',
 'downvotes',
 'tfidf',
 'brand',
 'categories',
 'description',
 'imUrl',
 'price',
 'related',
 'salesRank',
 'title']

Get an SArray of the concatenated text in the `summary`, `reviewText`, and `description` fields.

In [21]:
docs = msf.apply(lambda x: str(x['summary']) + ' ' + str(x['reviewText']) + ' ' + str(x['description']))

Create a function to count words from a `docs` SArray that outputs a `docs_sf` SFrame with associated word counts

In [26]:
def get_word_frequency(docs):
    """
    Returns the frequency of occurrence of words in an SArray of documents
    Args:
    docs: An SArray (of dtype str) of documents
    Returns:
    An SFrame with the following columns:
     'word'      : Word used
     'count'     : Number of times the word occured in all documents.
     'frequency' : Relative frequency of the word in the set of input documents.
    """

    # Use the count_words function to count the number of words.
    docs_sf = gl.SFrame()
    docs_sf['words'] = gl.text_analytics.count_words(docs)

    # Stack the dictionary into individual word-count pairs.
    docs_sf = docs_sf.stack('words', 
                         new_column_name=['word', 'count'])

    # Count the number of unique words (remove None values)
    docs_sf = docs_sf.groupby('word', {'count': gl.aggregate.SUM('count')})
    docs_sf['frequency'] = docs_sf['count'] / docs_sf["count"].sum()
    return docs_sf

In [None]:
docs_sf = get_word_frequency(docs)

In [None]:
def predict(document_bow, word_topic_counts, topic_counts, vocab,
            alpha=0.1, beta=0.01, num_burnin=5):
    """
    Make predictions for a single document.
    Parameters
    ----------
    document_bow : dict
        Dictionary with words as keys and document frequencies as counts.
    word_topic_counts : numpy array, num_vocab x num_topics
        Number of times a given word has ever been assigned to a topic.
    topic_counts : numpy vector of length num_topics
        Number of times any word has been assigned to a topic.
    vocab : dict
        Words are keys and unique integer is the value.
    alpha : float
        Hyperparameter. See topic_model docs.
    beta : float
        Hyperparameter. See topic_model docs.
    num_burnin : int
        Number of iterations of Gibbs sampling to perform at predict time.
    Returns
    -------
    out : numpy array of length num_topics
        Probabilities that the document belongs to each topic.
    """
    num_vocab, num_topics = word_topic_counts.shape

    # proportion of each topic in this test doc
    doc_topic_counts = np.zeros(num_topics)
    # Assignment of each unique word
    doc_topic_assignments = []

    # Initialize assignments and counts
    # NB: we are assuming document_bow doesn't change.
    for i, (word, freq) in enumerate(document_bow.iteritems()):
        if word not in vocab:  # skip words not present in training set
            continue
        topic = np.random.randint(0, num_topics-1)
        doc_topic_assignments.append(topic)
        doc_topic_counts[topic] += freq

    # Sample topic assignments for the test document
    for burnin in range(num_burnin):
        for i, (word, freq) in enumerate(document_bow.iteritems()):
            if word not in vocab:
                continue
            word_id = vocab[word]

            # Get old topic and decrement counts
            topic = doc_topic_assignments[i]
            doc_topic_counts[topic] -= freq

            # Sample a new topic
            gamma = np.zeros(num_topics)  # store probabilities
            for k in range(num_topics):
                gamma[k] = (doc_topic_counts[k] + alpha) * (word_topic_counts[word_id, k] + beta) / (topic_counts[k] + num_vocab * beta)
            gamma = gamma / gamma.sum()  # normalize to probabilities
            topic = np.random.choice(num_topics, 1, p=gamma)

            # Use new topic to increment counts
            doc_topic_assignments[i] = topic
            doc_topic_counts[topic] += freq

    # Create predictions
    predictions = np.zeros(num_topics)
    total_doc_topic_counts = doc_topic_counts.sum()
    for k in range(num_topics):
        predictions[k] = (doc_topic_counts[k] + alpha) / (total_doc_topic_counts + num_topics * alpha)
    return predictions / predictions.sum()


    if __name__ == '__main__':
    docs = gl.SFrame({'text': [{'first': 5, 'doc': 1}, {'second': 3, 'doc': 5}]})
    m = gl.topic_model.create(docs)

    # Get test document in bag of words format
    document_bow = docs['text'][0]

    # Input: Global parameters from trained model

    # Number of times each word in the vocabulary has ever been assigned to topic k (in any document). You can make an approximate version of this by multiplying m['topics'] by some large number (e.g. number of tokens in corpus) that indicates how strong you "believe" in these topics. Make it into counts by flooring it to an integer.
    prior_strength = 1000000
    word_topic_counts = np.array(m['topics']['topic_probabilities'])
    word_topic_counts = np.floor(prior_strength * word_topic_counts)

    # Number of times any word as been assigned to each topic.
    topic_counts = word_topic_counts.sum(0)

    # Get vocabulary lookup
    num_topics = m['num_topics']
    vocab = {}
    for i, w in enumerate(m['topics']['vocabulary']):
        vocab[w] = i
    num_vocab = len(vocab)

    # Make prediction on test document
    probs = predict(document_bow, word_topic_counts, topic_counts, vocab)

In [16]:
print msf[1]['summary']
print msf[1]['reviewText']
print msf[1]['description']

Okay for true beginners
So, I bought this book a few days ago and have tried three recipes so far.  The first was a total flop.  There must be an error, but be forewarned, do NOT make the Blueberry Coffee Cake as it comes out as inedible mush--WAY too much water.  The other two recipes (mac and cheese and grilled cheese with tomato) were decent for quick lunches or dinners.  They were average in taste, but considering the short amount of time it took to make them, I'm okay with that.  All in all, it's a nice idea book to get creative with everyday ingredients, but with errors and only average taste, I give it three stars.
In less time and for less money than it takes to order pizza, you can make it yourself!Three harried but heatlh-conscious college students compiled and tested this collection of more than 200 tasty, hearty, inexpensive recipes anyone can cook -- yes, anyone!Whether you're short on cash, fearful of fat, counting your calories, or just miss home cooking, The Healthy Col

In [8]:
wc = gl.text_analytics.count_words(docs, to_lower=True)

In [13]:
trimmer = gl.toolkits.feature_engineering.RareWordTrimmer(threshold=2)

# Fit and transform the data.
transformed_sf = trimmer.fit_transform(wc)

ToolkitError: Input data is not an SFrame. If it is a Pandas DataFrame, you may use the to_sframe() function to convert it to an SFrame.

In [11]:
len(wc)

3205467

In [None]:
# Use the count_words function to count the number of words.
docs_sf = gl.SFrame()
docs_sf['words'] = gl.text_analytics.count_words(docs)

# Stack the dictionary into individual word-count pairs.
docs_sf = docs_sf.stack('words', 
                     new_column_name=['word', 'count'])

# Count the number of unique words (remove None values)
docs_sf = docs_sf.groupby('word', {'count': gl.aggregate.SUM('count')})
docs_sf['frequency'] = docs_sf['count'] / docs_sf["count"].sum()