In [17]:
import graphlab
# Load the people_wiki SFrame from disk, then attach two derived columns:
# per-row word counts of the 'text' column, and TF-IDF scores computed from
# those word counts.
people = graphlab.SFrame('people_wiki.gl/')
text_tools = graphlab.text_analytics
people['word_count'] = text_tools.count_words(people['text'])
people['tfidf_value'] = text_tools.tf_idf(people['word_count'])

# Compare the top words by word count with the top words by TF-IDF

In [18]:
# Boolean mask that is true only for rows whose name is exactly 'Elton John';
# indexing `people` with it keeps just those rows.
is_elton_john = people['name'] == 'Elton John'
elton_john = people[is_elton_john]

In [19]:
def _top_three_words(row, column, score_name):
    """Return the three highest-scoring words stored in `row[column]`.

    The column is stacked into one (word, score) row per entry, sorted by
    score in descending order, and the first three words are returned.

    row        -- SFrame holding the article to inspect
    column     -- name of the column to stack ('word_count' or 'tfidf_value')
    score_name -- name given to the stacked score column
    """
    stacked = row.stack(column, new_column_name=['word', score_name])
    ranked = stacked.sort(score_name, ascending=False)
    return ranked['word'][0:3]


# answer1[0]: top three words by raw count; answer1[1]: top three by TF-IDF.
answer1 = [_top_three_words(elton_john, 'word_count', 'count'),
           _top_three_words(elton_john, 'tfidf_value', 'tfidf')]

# Measuring distance

In [20]:
# Select the rows for the two articles that will be compared with Elton John,
# matching on the exact value of the 'name' column.
name_column = people['name']
victoria_beckham = people[name_column == 'Victoria Beckham']
paul_mccartney = people[name_column == 'Paul McCartney']

In [29]:
# Cosine distance between the TF-IDF entry of Elton John's first row and the
# corresponding entry for each of the other two articles.
cosine = graphlab.distances.cosine
elton_tfidf = elton_john['tfidf_value'][0]
victoria_distance = cosine(elton_tfidf, victoria_beckham['tfidf_value'][0])
paul_distance = cosine(elton_tfidf, paul_mccartney['tfidf_value'][0])

# The strictly smaller distance wins; on a tie the result is Paul McCartney.
closer_to_elton = 'Victoria Beckham' if victoria_distance < paul_distance else 'Paul McCartney'

# answer2 = [distance to Victoria Beckham, distance to Paul McCartney, closer name]
answer2 = [victoria_distance, paul_distance, closer_to_elton]

# Building nearest neighbors models with different input features and setting the distance metric

In [22]:
def _cosine_neighbors_model(feature_column):
    """Build a nearest-neighbors model over `people` using a single feature
    column, labelling rows by 'name' and measuring cosine distance."""
    return graphlab.nearest_neighbors.create(people,
                                             features=[feature_column],
                                             label='name',
                                             distance='cosine')


# One model per feature representation, so their neighbours can be compared.
word_count_model = _cosine_neighbors_model('word_count')
tfidf_model = _cosine_neighbors_model('tfidf_value')

In [25]:
def _nearest_other_article(model, row):
    """Return the label of the closest neighbour of `row` that is not `row` itself.

    The original code took the second query result (index 1), which silently
    assumes the queried article always comes back first. If a tie or duplicate
    puts another article ahead of it, index 1 would be the queried article
    itself. Here the article's own name is filtered out explicitly and the
    first remaining neighbour is returned.

    model -- nearest-neighbors model created with label='name'
    row   -- SFrame holding the article to query; its first 'name' value
             identifies the article to exclude
    """
    own_name = row['name'][0]
    neighbors = model.query(row)
    # Assumes query results are ordered nearest-first, which the original
    # positional lookup relied on as well.
    others = neighbors[neighbors['reference_label'] != own_name]
    return others['reference_label'][0]


# Order: Elton John (word count, TF-IDF), then Victoria Beckham (word count, TF-IDF).
answer3 = [_nearest_other_article(word_count_model, elton_john),
           _nearest_other_article(tfidf_model, elton_john),
           _nearest_other_article(word_count_model, victoria_beckham),
           _nearest_other_article(tfidf_model, victoria_beckham)]

# Results

In [32]:
def _print_section(title, rows):
    """Print a section title followed by one "label value" line per row.

    Every line is formatted into a single string before printing. A
    single-argument print(...) is valid as both a Python 2 print statement and
    a Python 3 function call, and emits the same text as the original
    `print label, value` statements (label, one space, str(value)).

    title -- heading line for the section
    rows  -- list of (label, value) pairs
    """
    print(title)
    for label, value in rows:
        print('%s %s' % (label, value))


_print_section('Answer 1', [
    ('3 words with highest word counts:  ', answer1[0]),
    ('3 words with highest tfidf values: ', answer1[1]),
])
print('')
_print_section('Answer 2', [
    ('Cosine distance between Elton John and Victoria Beckham: ', answer2[0]),
    ('Cosine distance between Elton John and Paul McCartney:   ', answer2[1]),
    ('Closer to Elton John:                                    ', answer2[2]),
])
print('')
_print_section('Answer 3', [
    ('Most similar article, other than itself, to Elton John using word count features:       ', answer3[0]),
    ('Most similar article, other than itself, to Elton John using TF-IDF features:           ', answer3[1]),
    ('Most similar article, other than itself, to Victoria Beckham using word count features: ', answer3[2]),
    ('Most similar article, other than itself, to Victoria Beckham using TF-IDF features:     ', answer3[3]),
])

Answer 1
3 words with highest word counts:   ['the', 'in', 'and']
3 words with highest tfidf values:  ['furnish', 'elton', 'billboard']

Answer 2
Cosine distance between Elton John and Victoria Beckham:  0.956700637666
Cosine distance between Elton John and Paul McCartney:    0.825031002922
Closer to Elton John:                                     Paul McCartney

Answer 3
Most similar article, other than itself, to Elton John using word count features:        Cliff Richard
Most similar article, other than itself, to Elton John using TF-IDF features:            Rod Stewart
Most similar article, other than itself, to Victoria Beckham using word count features:  Mary Fitzgerald (artist)
Most similar article, other than itself, to Victoria Beckham using TF-IDF features:      David Beckham
