# Document retrieval

In [14]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load data

In [127]:
people = pd.read_csv('people_wiki.csv')

In [128]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


# Explore the dataset and checkout the text it contains

## Explore the entry for predisent Obama

In [17]:
obama = people[people['name']=='Barack Obama']

In [18]:
obama.text

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

# Get the word counts for Obama

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
count_vectorizer = CountVectorizer()

In [21]:
def count_words(s):
    dic = {}
    X = count_vectorizer.fit_transform([s]).toarray()[0]
    for word, index in count_vectorizer.vocabulary_.items():
        dic[word] = X[index]
    return dic

In [22]:
obama['word_count'] = obama['text'].apply(count_words)

In [43]:
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


# Sort the word counts for the Obama article

## Turning dictionary of word counts into a table

In [58]:
obama_word_count_table = pd.DataFrame(list(obama['word_count'].values[0].items()), columns=['word', 'count'])
obama_word_count_table.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,word,count
11,the,40
25,in,30
13,and,21
16,of,18
22,to,14
45,his,11
2,obama,9
137,act,8
35,he,7
37,as,6


# Compute the TF-IDF for the corpus

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
tfidf_vectorizer = TfidfVectorizer()

In [61]:
tfidf_vectorizer.fit_transform(people.text)

<59071x548429 sparse matrix of type '<class 'numpy.float64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

In [75]:
obama_tfidf = pd.DataFrame([tfidf_vectorizer.get_feature_names(), list(tfidf_vectorizer.transform(obama.text).toarray()[0])]).T

In [79]:
obama_tfidf.sort_values(by=1, ascending=False).head()

Unnamed: 0,0,1
358557,obama,0.365018
488148,the,0.279323
45073,act,0.249089
251905,in,0.209673
259220,iraq,0.151809


# Manually compute distances between a few people

In [80]:
clinton = people[people['name']=='Bill Clinton']
beckham = people[people['name']=='David Beckham']

In [81]:
from sklearn.metrics.pairwise import cosine_distances

In [82]:
cosine_distances(tfidf_vectorizer.transform(beckham.text), tfidf_vectorizer.transform(obama.text))

array([[0.8420454]])

In [83]:
cosine_distances(tfidf_vectorizer.transform(clinton.text), tfidf_vectorizer.transform(obama.text))

array([[0.67497775]])

# Build a nearest neighbor model for document retrieval

In [84]:
from sklearn.neighbors import KNeighborsClassifier

In [85]:
knn_model = KNeighborsClassifier(algorithm='brute', metric='cosine')

In [86]:
knn_model.fit(tfidf_vectorizer.transform(people.text), people.name)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama

In [87]:
dist, ind = knn_model.kneighbors(tfidf_vectorizer.transform(obama.text))

In [100]:
res = pd.DataFrame({'name':people.iloc[ind[0], 1], 'distance': dist[0]})
res

Unnamed: 0,distance,name
35817,0.0,Barack Obama
24478,0.570781,Joe Biden
57108,0.615934,Hillary Rodham Clinton
38376,0.624993,Samantha Power
38714,0.649765,Eric Stern (politician)


# Question 1

In [102]:
countvec = count_vectorizer.transform(people[people['name'] =='Elton John'].text)

In [103]:
pd.DataFrame({'word': count_vectorizer.get_feature_names(), 'count': countvec.toarray()[0]}).sort_values(by='count', ascending=False).head()

Unnamed: 0,count,word
242,27,the
115,18,in
28,15,and
162,13,of
102,9,has


# Question 2

In [104]:
countvec = tfidf_vectorizer.transform(people[people['name'] =='Elton John'].text)
pd.DataFrame({'word': tfidf_vectorizer.get_feature_names(), 'count': countvec.toarray()[0]}).sort_values(by='count', ascending=False).head()

Unnamed: 0,count,word
488148,0.243684,the
89663,0.192207,billboard
267446,0.188958,john
178605,0.184686,elton
208739,0.181221,furnish


# Question 3

In [105]:
cosine_distances(tfidf_vectorizer.transform(beckham.text), tfidf_vectorizer.transform(people[people['name']=='Elton John'].text))

array([[0.82385288]])

# Question 4

In [106]:
cosine_distances(tfidf_vectorizer.transform(people[people['name']=='Elton John'].text), tfidf_vectorizer.transform(people[people['name']=='Paul McCartney'].text))

array([[0.69231325]])

# Question 6

In [107]:
word_count_knn = KNeighborsClassifier()

In [108]:
word_count_knn.fit(count_vectorizer.transform(people.text), people.name)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [120]:
dist, ind = word_count_knn.kneighbors(count_vectorizer.transform(people[people['name']=='Elton John'].text), n_neighbors=people['name'].count())
res = pd.DataFrame({'name':people.iloc[ind[0], 1], 'distance': dist[0]})
res.set_index('name', inplace=True)
res.loc[['Billy Joel', 'Cliff Richard', 'Roger Daltrey', 'George Bush']]

Unnamed: 0_level_0,distance
name,Unnamed: 1_level_1
Billy Joel,21.794495
Cliff Richard,19.104973
Roger Daltrey,16.970563
George Bush,


# Question 7

In [121]:
dist, ind = knn_model.kneighbors(tfidf_vectorizer.transform(people[people['name']=='Elton John'].text), n_neighbors=people['name'].count())
res = pd.DataFrame({'name':people.iloc[ind[0], 1], 'distance': dist[0]})
res.set_index('name', inplace=True)
res.loc[['Rod Stewart', 'Tommy Haas', 'Roger Daltrey', 'Elvis Presley']]

Unnamed: 0_level_0,distance
name,Unnamed: 1_level_1
Rod Stewart,0.589361
Tommy Haas,0.781451
Roger Daltrey,0.686903
Elvis Presley,


# Question 8

In [126]:
dist, ind = word_count_knn.kneighbors(count_vectorizer.transform(people[people['name']=='Victoria Beckham'].text), n_neighbors=people['name'].count())
res = pd.DataFrame({'name':people.iloc[ind[0], 1], 'distance': dist[0]})
res.set_index('name', inplace=True)
res.loc[['Stephen Dow Beckham', 'Louis Molloy', 'Adrience Corri', 'Mary Fitzgerald(artist)']]

Unnamed: 0_level_0,distance
name,Unnamed: 1_level_1
Stephen Dow Beckham,12.489996
Louis Molloy,13.190906
Adrience Corri,
Mary Fitzgerald(artist),


# Question 9

In [125]:
dist, ind = knn_model.kneighbors(tfidf_vectorizer.transform(people[people['name']=='Victoria Beckham'].text), n_neighbors=people['name'].count())
res = pd.DataFrame({'name':people.iloc[ind[0], 1], 'distance': dist[0]})
res.set_index('name', inplace=True)
res.loc[['Mel B', 'Caroline Rush', 'David Beckham', 'Carrie Reichardt']]

Unnamed: 0_level_0,distance
name,Unnamed: 1_level_1
Mel B,0.718422
Caroline Rush,0.763821
David Beckham,0.546477
Carrie Reichardt,0.859191
