#Document retrieval from wikipedia data

#Import pandas

In [2]:
import pandas as pd

#Load some text data - from wikipedia, pages on people

In [3]:
people = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [4]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [5]:
len(people)

59071

#Explore the dataset and checkout the text it contains

##Exploring the entry for president Obama

In [6]:
obama = people[people['name'] == 'Barack Obama']

In [7]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [8]:
obama['text'].values

array(['barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 a

##Exploring the entry for actor George Clooney

In [9]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

#Get the word counts for Obama article

In [10]:
from collections import Counter
word_count = []
count = Counter(str(obama.text.values).split())
word_count.append(count)

In [11]:
obama['word_count'] = word_count
obama.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'['barack': 1, 'hussein': 1, 'obama': 9, 'ii'..."


##Sort the word counts for the Obama article

###Turning dictonary of word counts into a table

In [12]:
obama_word_count_table = pd.DataFrame(sorted(count.items(),key=lambda pair: pair[1],reverse=True))

###Sorting the word counts to show most common words at the top

In [13]:
obama_word_count_table.head()

Unnamed: 0,0,1
0,the,40
1,in,30
2,and,21
3,of,18
4,to,14


Most common words include uninformative words like "the", "in", "and",... Doesn't have much meaning!!

#Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [14]:
freq_count = []
for item in people['text']:
    count = Counter(str(item).split())
    freq_count.append(count)
people['word_count'] = freq_count
people.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ..."


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tfidfVect = TfidfVectorizer()
tfidf = tfidfVect.fit_transform(people['text'])
tfidf

<59071x548429 sparse matrix of type '<class 'numpy.float64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

In [57]:
print(tfidf)

  (0, 160652)	0.09377484096114971
  (0, 336162)	0.5151776851532417
  (0, 96612)	0.012817255065954401
  (0, 704)	0.03350953071413243
  (0, 359577)	0.029493401402676842
  (0, 11143)	0.036415320435667135
  (0, 259893)	0.010385827391862295
  (0, 203171)	0.023824215532529995
  (0, 72887)	0.11754176903095465
  (0, 427313)	0.048779520719894734
  (0, 202230)	0.04283717911908677
  (0, 530685)	0.019302526893852503
  (0, 386988)	0.07717627857142736
  (0, 533439)	0.06076489187161468
  (0, 488148)	0.2729092231853667
  (0, 272528)	0.2392918460443005
  (0, 58906)	0.04045701296072861
  (0, 111950)	0.12817477591058646
  (0, 251905)	0.13151423134101511
  (0, 202189)	0.3596344712497114
  (0, 293515)	0.12473936897966988
  (0, 48415)	0.11413890646314521
  (0, 529346)	0.04131355198210769
  (0, 72734)	0.039095887526818605
  (0, 239271)	0.03828243630336539
  :	:
  (59070, 104416)	0.058420868157660545
  (59070, 475409)	0.06540595261294063
  (59070, 138876)	0.05818940237670459
  (59070, 180421)	0.06839657537189

Count of occcurence of 'the' in the entire corpus

In [58]:
print(tfidfVect.vocabulary_.get('the'))

488148


##Examine the TF-IDF for the Obama article

In [16]:
obama = people[people['name'] == 'Barack Obama']

Find list of words in obama

In [17]:
for words in obama.word_count.items():
    print(set(words[1].elements()))
    obama_words = set(words[1].elements())

{'briefs', 'august', 'with', '2009', 'bin', 'us', 'illinois', 'marriage', '1997', 'into', 'laureateduring', 'recovery', 'second', 'current', 'policy', 'often', 'debate', 'promoted', 'represent', 'earning', '2008', 'stimulus', 'down', 'sought', 'constitutional', 'served', 'attorney', 'levels', 'californias', 'barack', 'nobel', 'operation', 'by', 'began', 'arms', 'january', 'included', 'tax', 'as', 'won', 'consumer', 'obama', 'protection', 'republican', 'first', 'creation', 'american', 'signed', 'tell', '2012obama', 'sworn', 'elementary', 'called', 'relations', 'act', 'after', 'job', 'foreign', 'ended', 'term', 'organizer', 'street', 'republicans', 'legislation', 'prize', 'insurance', '1996', 'taxpayer', 'withdrawal', 'sufficient', 'peace', 'sandy', 'march', 'equality', 'continued', 'office', 'unemployment', '2010', 'court', 'libya', 'delegates', 'attention', 'start', '2004', 'unsuccessfully', 'islamic', '44th', 'representing', 'degree', 'nations', 'two', 'university', 'total', 'victory'

In [18]:
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


Get Tfidf for Obama's text..
fit with corpus and trasnform with obama's text

In [19]:
obama_tfidfVect = TfidfVectorizer()
obama_tfidfVect = obama_tfidfVect.fit(people['text'])
obama_tfidf = obama_tfidfVect.transform(obama['text'])

In [20]:
obama_tfidf.max()

0.3650175898187781

Vocabulary of the corpus with frequencies

In [None]:
obama_tfidfVect.vocabulary_
obama_tfidf_table = pd.DataFrame(sorted(obama_tfidfVect.vocabulary_.items(),key=lambda pair: pair[1],reverse=True))
obama_tfidf_table

In [None]:
print(obama_tfidf)

In [21]:
feature_names = obama_tfidfVect.get_feature_names()
for col in obama_tfidf.nonzero()[1]:
    print(feature_names[col], ' - ', obama_tfidf[0, col])

13th  -  0.04155223560328909
1961  -  0.03016581962187603
1992  -  0.02289189867875382
1996  -  0.021894640640149326
1997  -  0.021852752342990073
20  -  0.04806514415164882
2000in  -  0.05056331448649607
2004  -  0.056353284811914046
2007  -  0.017372040001768754
2008  -  0.01752122556256475
2009  -  0.053716110129215185
2010  -  0.036208432821067116
2011  -  0.0566046837038299
2012  -  0.0195067923551467
2012obama  -  0.07885433263257774
2013  -  0.020629112598581526
44th  -  0.05624071615493585
63  -  0.04341764575235663
act  -  0.2490890416206761
address  -  0.040499804202992096
administration  -  0.029988224032272276
affordable  -  0.04976117912769321
afghanistan  -  0.07971048613465608
african  -  0.03199054994063867
after  -  0.05430461125327095
against  -  0.021001902062722264
american  -  0.04456160767469354
americans  -  0.04021821930188391
and  -  0.14673880270062417
arms  -  0.04209027344177608
as  -  0.047221719116605276
ask  -  0.04542634533461067
at  -  0.016971534950574

Get highest ranking words in obama text using TF IDF

In [23]:
import numpy as np

In [24]:
feature_array = np.array(feature_names)
tfidf_sorting = np.argsort(obama_tfidf.toarray()).flatten()[::-1]
top_n = feature_array[tfidf_sorting][:10]
top_n

array(['obama', 'the', 'act', 'in', 'iraq', 'and', 'law', 'control', 'of',
       'us'], dtype='<U2140')

Words with highest TF-IDF are much more informative.

# #Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [63]:
#from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [67]:
nbrs = NearestNeighbors(n_neighbors=10).fit(tfidf)

Find 10 nearest neighbours to Obama

In [69]:
distances, indices = nbrs.kneighbors(obama_tfidf)
distances,indices

(array([[0.        , 1.06843875, 1.1098956 , 1.11802815, 1.13996938,
         1.14776932, 1.1503744 , 1.15493924, 1.15776052, 1.15967833]]),
 array([[35817, 24478, 57108, 38376, 38714, 28447, 39357, 48693, 18827,
         46811]], dtype=int64))

In [71]:
names_similar = pd.Series(indices.flatten()).map(people.reset_index()['name'])
names_similar

0               Barack Obama
1                  Joe Biden
2     Hillary Rodham Clinton
3             Samantha Power
4    Eric Stern (politician)
5             George W. Bush
6                John McCain
7                Artur Davis
8               Henry Waxman
9              Jeff Sessions
dtype: object

In [76]:
result = pd.DataFrame({'distance':distances.flatten(), 'name':names_similar})
result[1:]

Unnamed: 0,distance,name
1,1.068439,Joe Biden
2,1.109896,Hillary Rodham Clinton
3,1.118028,Samantha Power
4,1.139969,Eric Stern (politician)
5,1.147769,George W. Bush
6,1.150374,John McCain
7,1.154939,Artur Davis
8,1.157761,Henry Waxman
9,1.159678,Jeff Sessions


#Applying the nearest-neighbors model for retrieval

##Who is closest to Obama?

As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  