# Retrieving Wikipedia articles

In [1]:
import turicreate

## Load some text data from Wikipedia

In [34]:
# Load the Wikipedia "people" dataset from its on-disk SFrame directory.
people = turicreate.SFrame(
    '../../data/people_wiki.sframe'
)

In [35]:
# Display the loaded SFrame to inspect its columns (URI, name, text).
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [36]:
# Add a per-article word_count column built from the raw article text
# (each entry is a word -> count dictionary, as the outputs below show).
people['word_count'] = turicreate.text_analytics.count_words(
    people['text']
)

## Exploring the person of interest (POI): Elton John

In [37]:
# Keep only the row(s) of the corpus whose name is exactly 'Elton John'.
poi = people[
    people['name'] == 'Elton John'
]

In [38]:
# Show the selected entry; it carries the word_count column added to `people`.
poi

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'movements': 1.0, 'social': 1.0, ..."


In [39]:
# `poi` was sliced out of `people` after the word_count column had been added,
# so it already carries that column (visible in the previous cell's output).
# Recomputing it here would only repeat the same work and mutate the slice in
# place, so just verify the invariant instead.
assert 'word_count' in poi.column_names(), \
    "poi is missing 'word_count'; re-run the cells above in order"

In [40]:
# Re-display `poi` to confirm that its word_count column is present.
poi

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'movements': 1.0, 'social': 1.0, ..."


## Finding the 3 words in his article with the highest word counts

In [41]:
# Unpack the word_count dictionary into one row per word, giving a two-column
# (word, count) table for the selected article.
poi_word_count_table = poi[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)

In [42]:
# Order the words from most to least frequent; the top rows answer the question.
poi_word_count_table.sort(
    'count',
    ascending=False,
)

word,count
the,27.0
in,18.0
and,15.0
of,13.0
a,10.0
has,9.0
he,7.0
john,7.0
on,6.0
since,5.0


## Finding the 3 words in his article with the highest TF-IDF

### Find the TF-IDF for the entire corpus

In [43]:
# Compute TF-IDF over the whole corpus so the weights reflect every article,
# and store the result as a new tfidf column.
people['tfidf'] = turicreate.text_analytics.tf_idf(
    people['text']
)

In [44]:
# Display the corpus again; it now carries a tfidf column alongside word_count.
people

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'parade': 1.0, ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'hour': 1.0, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'society': 1.0, 'hamilton': 1.0, 'to': ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 1.0, 'awarded': 1.0, '2004': ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 1.0, '2007': 1.0, 'cent': 1.0, ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'asses': 1.0, 'sic': 1.0, 'toilets': 1.0, ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'streamz': 1.0, 'including': 1.0, ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'concordia': 1.0, 'creative': 1.0, ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'heavies': 1.0, 'new': 1.0, 'brand': 1.0, ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'2002': 1.0, 'harvard': 1.0, 'twentieth': 1.0, ..."

tfidf
"{'melbourne': 3.8914310119380633, ..."
"{'time': 1.3253342074200498, ..."
"{'society': 2.4448047262085693, ..."
"{'kurdlawitzpreis': 10.986495389225194, ..."
"{'curtis': 5.299520032885375, ..."
"{'asses': 9.600201028105303, 's ..."
"{'streamz': 10.986495389225194, ..."
"{'concordia': 6.250296940830698, ..."
"{'heavies': 8.907053847545358, 'n ..."
"{'2002': 1.8753125887822302, ..."


## Examine the TF-IDF for the Elton John entry

In [45]:
# Re-select the Elton John entry: the earlier `poi` slice was taken before the
# tfidf column was added to `people`, so a fresh slice is needed to include it.
poi = people[
    people['name'] == 'Elton John'
]

In [46]:
# Flatten the tfidf dictionary into (word, tfidf) rows and list the
# highest-weighted words first.
(
    poi[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154
overallelton,10.986495389225194
tonightcandle,10.986495389225194
19702000,10.293348208665249
fivedecade,10.293348208665249
aids,10.262846934045534


## Compute the cosine distance from Elton John's article to Paul McCartney's and Victoria Beckham's articles

In [47]:
# Pull out the three entries to compare, one single-name filter per person.
# The generator is consumed immediately by the tuple unpacking, in this order.
elton_john, victoria_beckham, paul_mccartney = (
    people[people['name'] == person_name]
    for person_name in ('Elton John', 'Victoria Beckham', 'Paul McCartney')
)

### Is Elton John closer to Paul McCartney or to Victoria Beckham?

In [48]:
# Cosine distance between the TF-IDF dictionaries of the first (index 0) row of
# each selection.
elton_to_mccartney = turicreate.distances.cosine(
    elton_john['tfidf'][0],
    paul_mccartney['tfidf'][0],
)

In [49]:
# Show the Elton John -> Paul McCartney cosine distance computed above.
elton_to_mccartney

0.8250310029221779

In [50]:
# Same measurement against Victoria Beckham: cosine distance between the TF-IDF
# dictionaries of the first (index 0) row of each selection.
elton_to_beckham = turicreate.distances.cosine(
    elton_john['tfidf'][0],
    victoria_beckham['tfidf'][0],
)

In [51]:
# Show the Elton John -> Victoria Beckham cosine distance computed above.
elton_to_beckham

0.9567006376655429

## Building a nearest neighbors model using word counts as features

In [52]:
# Fit a nearest-neighbours model over the whole corpus, comparing articles by
# the cosine distance of their raw word counts and labelling results by name.
knn_model = turicreate.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

### Finding entries closest to Elton John using word counts as features

In [53]:
# Look up the nearest neighbours of the Elton John entry.
# NOTE: `knn_model` is rebound further down the notebook; this result assumes
# top-to-bottom execution, i.e. the word-count model is the one in scope.
knn_model.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


### Finding entries closest to Victoria Beckham using word counts as features

In [54]:
# Look up the nearest neighbours of the Victoria Beckham entry.
# NOTE: `knn_model` is rebound further down the notebook; this result assumes
# top-to-bottom execution, i.e. the word-count model is the one in scope.
knn_model.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


## Building a nearest neighbors model using TF-IDF as features

In [56]:
# Refit the nearest-neighbours model on TF-IDF weights instead of raw counts.
# This rebinds `knn_model`, so every query after this cell uses the TF-IDF model.
knn_model = turicreate.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

### Finding entries closest to Elton John using TF-IDF as features

In [57]:
# Look up the nearest neighbours of the Elton John entry.
# NOTE: assuming top-to-bottom execution, `knn_model` here is the TF-IDF model
# created in the cell above, not the earlier word-count model.
knn_model.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692848,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


### Finding entries closest to Victoria Beckham using TF-IDF as features

In [58]:
# Look up the nearest neighbours of the Victoria Beckham entry.
# NOTE: assuming top-to-bottom execution, `knn_model` here is the TF-IDF model
# created above, not the earlier word-count model.
knn_model.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
