# Nearest Neighbors

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
import itertools
%matplotlib inline

In [2]:
# Read the article table from CSV into the `wiki` DataFrame used by every later cell.
wiki = pd.read_csv('people_wiki.csv')

In [3]:
# Preview the first rows of the raw table.
wiki.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
def load_sparse_csr(filename):
    """
    Load a CSR sparse matrix that was stored as an ``.npz`` archive.

    filename: path (or file-like object) accepted by ``np.load``. The archive
        must contain the arrays 'data', 'indices', 'indptr' and 'shape'.

    Returns a ``scipy.sparse.csr_matrix`` assembled from those four arrays.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the underlying file
    # open; the context manager closes it as soon as the arrays are read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Hand scipy a plain tuple of Python ints instead of an ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix((data, indices, indptr), shape=shape)

In [5]:
# Sparse word-count matrix, loaded via the helper defined above.
word_count = load_sparse_csr('people_wiki_word_count.npz')
# typ='series' makes read_json return a pandas Series instead of a DataFrame.
# NOTE(review): presumably indexed by word with the column id as value - confirm against the JSON file.
map_index_to_word = pd.read_json('people_wiki_map_index_to_word.json',typ='series')

In [6]:
def count_words(X, voc):
    """
    Turn every row of a sparse matrix into a ``{word: value}`` dictionary.

    X: the return matrix of CountVectorizer.transform (a scipy sparse matrix;
        it is read in CSR form)
    voc : vect.vocabulary_, i.e. a mapping word -> column index. Any object
        with an ``.items()`` method works (a dict or a pandas Series).

    Returns a list with one dict per row of X. Each dict maps the word of
    every stored entry in that row to its value.
    Raises KeyError if a stored column index has no word in ``voc``.
    """
    # Invert word -> column into column -> word. ``.items()`` is used because
    # ``.iteritems()`` does not exist on Python 3 dicts and was removed from
    # pandas Series in pandas 2.0.
    rvoc = {col: word for word, col in voc.items()}

    # Read the CSR arrays directly: indexing X[row, col] element by element
    # performs a sparse lookup per entry and is far slower on large matrices.
    X = X.tocsr()
    indptr, indices, data = X.indptr, X.indices, X.data

    word_counts = []
    for row_id in range(X.shape[0]):
        start, stop = indptr[row_id], indptr[row_id + 1]
        dic = {}
        for col, value in zip(indices[start:stop], data[start:stop]):
            word = rvoc[col]
            # Duplicate (row, col) entries are summed, as X[row, col] would do.
            dic[word] = dic[word] + value if word in dic else value
        word_counts.append(dic)
    return word_counts

In [7]:
# One {word: count} dict per row of the sparse word_count matrix.
word_counts = count_words(word_count,map_index_to_word)

In [8]:
# Attach the per-article dicts as a new column (list order must match wiki's row order).
wiki['word_count'] = word_counts

In [9]:
# Preview again to confirm the new word_count column is present.
wiki.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'brisbaneafter': 1, 'edflhe': 1, 'aflfrom': 1..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'maladaptation': 1, 'phasedelay': 1, '25hour'..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'germanyover': 1, 'bluesgospel': 1, 'harpdog'..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'fantasticrottensteiner': 1, 'waidmannsfeld':..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'arhm': 3, 'gangstergenka': 1, 'kuhnja': 1, '..."


## Find nearest neighbors

In [54]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search with Euclidean distance on raw word counts.
model = NearestNeighbors(metric='euclidean', algorithm='brute')
# fit() is the last expression of the cell, so its return value is what gets displayed.
model.fit(word_count)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [55]:
# Look up Barack Obama's row; its index label is reused as a row id in the next cell.
wiki[wiki['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'husen': 1, '2012obama': 1, 'laureateduring':..."


In [56]:
# 35817 is the index label shown for Barack Obama in the previous cell's output.
# NOTE(review): using it as a positional row of word_count assumes wiki keeps its default 0..n-1 index - confirm.
distances, indices = model.kneighbors(word_count[35817,:], n_neighbors=10)

In [57]:
# Pair every returned row id with its distance and key the table by id,
# so it aligns with wiki's index in the join below.
neighbors = pd.DataFrame(
    {'distance': distances.flatten(), 'id': indices.flatten()}
).set_index('id')
# The right join keeps only the neighbour rows; show them nearest first.
wiki.join(neighbors, how='right').nsmallest(10, 'distance')[['name', 'distance']]

Unnamed: 0_level_0,name,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1
35817,Barack Obama,0.0
24478,Joe Biden,33.075671
28447,George W. Bush,34.394767
35357,Lawrence Summers,36.152455
14754,Mitt Romney,36.166283
13229,Francisco Barrio,36.331804
31423,Walter Mondale,36.400549
22745,Wynn Normington Hugh-Jones,36.496575
36364,Don Bonker,36.633318
9210,Andy Anstett,36.959437


In [79]:
def top_words(name):
    """
    Build the word-frequency table of one person's article, most frequent first.

    name: value matched against the 'name' column of the global `wiki` frame;
        the first matching row is used.

    Returns a DataFrame indexed by 'word' with a single 'count' column,
    sorted by 'count' in descending order.
    """
    # The word_count cell of the first matching row is a {word: count} dict.
    counts = wiki.loc[wiki['name'] == name, 'word_count'].values[0]
    table = pd.DataFrame({'word': list(counts.keys()),
                          'count': list(counts.values())})
    return table.set_index('word').sort_values('count', ascending=False)

In [81]:
# Word-frequency table for Barack Obama's article, most frequent words first.
obama_words = top_words('Barack Obama')
obama_words.head()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
the,40
in,30
and,21
of,18
to,14


In [82]:
# Same table for Francisco Barrio, one of Obama's nearest neighbours in the raw word-count listing above.
barrio_words = top_words('Francisco Barrio')
barrio_words.head()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
the,36
of,24
and,18
in,17
he,10


In [83]:
# Inner join on the 'word' index keeps only words present in both articles.
# Both tables have a 'count' column, so the suffixes tell the two apart.
combined_words = obama_words.join(barrio_words,
                                  how='inner',
                                 lsuffix='_obama',
                                 rsuffix='_barrio')
combined_words.head()

Unnamed: 0_level_0,count_obama,count_barrio
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,36
in,30,17
and,21,18
of,18,24
to,14,9


In [84]:
# Shorten the suffixed column names to the person they belong to.
combined_words = combined_words.rename(columns={'count_obama':'Obama', 'count_barrio':'Barrio'})
combined_words.head()

Unnamed: 0_level_0,Obama,Barrio
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,36
in,30,17
and,21,18
of,18,24
to,14,9


In [85]:
# Shared words ranked by their count in Obama's article (displayed only, not stored).
combined_words.sort_values('Obama', ascending=False).head()

Unnamed: 0_level_0,Obama,Barrio
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,36
in,30,17
and,21,18
of,18,24
to,14,9


In [86]:
# The five shared words that occur most often in Obama's article.
common_words = set(combined_words.sort_values('Obama', ascending=False).index[:5])
def has_top_words(word_count_vector):
    """
    Tell whether an article contains every word in the global `common_words`.

    word_count_vector: mapping of word -> count for one article.

    Returns True when all of `common_words` appear among the mapping's keys,
    False otherwise.
    """
    vocabulary = set(word_count_vector.keys())
    return common_words <= vocabulary

# Flag every article whose word_count dict contains all of the common words.
wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)

# Number of flagged articles (the quiz answer): True counts as 1 in the sum.
print(wiki['has_top_words'].sum())

56066


In [88]:
# Spot check on row 32 against the expected answer printed below.
# has_top_words only returns a bool, so the unique_words length has to be checked by hand.
print('Output from your function:', has_top_words(wiki.iloc[32]['word_count']))
print('Correct output: True')
print('Also check the length of unique_words. It should be 167')

Output from your function: True
Correct output: True
Also check the length of unique_words. It should be 167


In [89]:
# Spot check on row 33, where the expected answer is False.
# has_top_words only returns a bool, so the unique_words length has to be checked by hand.
print('Output from your function:', has_top_words(wiki.iloc[33]['word_count']))
print('Correct output: False')
print('Also check the length of unique_words. It should be 188')

Output from your function: False
Correct output: False
Also check the length of unique_words. It should be 188


In [90]:
from sklearn.metrics.pairwise import euclidean_distances

In [91]:
# Sparse word-count row for Barack Obama, located by name instead of a hard-coded id.
word_count[wiki[wiki['name']=='Barack Obama'].index.values[0],:]

<1x547979 sparse matrix of type '<class 'numpy.int64'>'
	with 273 stored elements in Compressed Sparse Row format>

In [92]:
def _word_count_row(name):
    """Return the sparse word_count row of the first wiki article whose name equals `name`."""
    return word_count[wiki[wiki['name'] == name].index.values[0], :]


# Pairwise Euclidean distances between the three politicians, printed in the
# order Obama-Bush, Obama-Biden, Biden-Bush. Each is a 1x1 array.
for _person_a, _person_b in [('Barack Obama', 'George W. Bush'),
                             ('Barack Obama', 'Joe Biden'),
                             ('Joe Biden', 'George W. Bush')]:
    print(euclidean_distances(_word_count_row(_person_a), _word_count_row(_person_b)))

[[34.39476704]]
[[33.07567082]]
[[32.75667871]]


In [93]:
bush_words = top_words('George W. Bush')
# NOTE: this rebinds combined_words, which held the Obama/Barrio table until now.
# Only rsuffix is given, so Obama's column stays 'count' and Bush's becomes 'count.1'.
combined_words = obama_words.join(bush_words, how='inner',rsuffix='.1')
# Ten shared words that are most frequent in Obama's article.
combined_words.sort_values('count',ascending = False)[0:10]

Unnamed: 0_level_0,count,count.1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,39
in,30,22
and,21,14
of,18,14
to,14,11
his,11,6
act,8,3
he,7,8
a,7,6
law,6,1


# TF-IDF to the rescue

In [94]:
# Sparse TF-IDF matrix, loaded the same way as the word-count matrix.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')

In [95]:
# count_words only maps stored entries to words, so here it yields {word: tf-idf weight} dicts.
tf_idfs = count_words(tf_idf, map_index_to_word)

In [96]:
# Attach the per-article TF-IDF dicts as a new column.
wiki['tf_idf'] = tf_idfs

In [97]:
# Same brute-force Euclidean search as before, now fitted on the TF-IDF matrix.
model_tf_idf = NearestNeighbors(metric='euclidean', algorithm='brute')
model_tf_idf.fit(tf_idf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [98]:
# Row 35817 is the Barack Obama row id used earlier; this overwrites distances/indices from the word-count query.
distances, indices = model_tf_idf.kneighbors(tf_idf[35817], n_neighbors=10)

In [99]:
# Pair every returned row id with its TF-IDF distance and key the table by id,
# so it aligns with wiki's index in the join below.
neighbors = pd.DataFrame(
    {'distance': distances.flatten(), 'id': indices.flatten()}
).set_index('id')
# The right join keeps only the neighbour rows; show them nearest first.
wiki.join(neighbors, how='right').nsmallest(10, 'distance')[['name', 'distance']]

Unnamed: 0_level_0,name,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1
35817,Barack Obama,0.0
7914,Phil Schiliro,106.861014
46811,Jeff Sessions,108.871674
44681,Jesse Lee (politician),109.045698
38376,Samantha Power,109.108106
6507,Bob Menendez,109.781867
38714,Eric Stern (politician),109.957788
44825,James A. Guest,110.413889
44368,Roland Grossenbacher,110.470609
33417,Tulsi Gabbard,110.696998
