In [1]:
# /content/drive/MyDrive/Practice Notebook/Document Retrieval/people_wiki.csv

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [3]:
# Location of the people_wiki dataset; change this one constant if the file moves.
PEOPLE_WIKI_CSV = '/content/drive/MyDrive/Practice Notebook/Document Retrieval/people_wiki.csv'

# Load the articles and preview the first few rows.
df = pd.read_csv(PEOPLE_WIKI_CSV)
df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
def word_count(string):
    """Tally how often each whitespace-separated token occurs in ``string``.

    Args:
        string: Text to tokenise. Tokens are split on runs of whitespace;
            no lower-casing or punctuation stripping is applied.

    Returns:
        A dict mapping each token to its number of occurrences, with keys in
        order of first appearance. Empty or whitespace-only text gives ``{}``.
    """
    tallies = {}
    # split() with no arguments already ignores leading/trailing whitespace.
    for token in string.split():
        if token in tallies:
            tallies[token] += 1
        else:
            tallies[token] = 1
    return tallies

In [5]:
# Select the row(s) whose name is exactly 'Barack Obama' and tally the article's words.
obama = df.loc[df['name'] == 'Barack Obama']
obama_word_count_values = obama['text'].map(word_count)

# Build a one-column table from the first match's tally (word -> count) ...
obama_word_count_table = pd.DataFrame.from_dict(
    obama_word_count_values.values[0],
    orient='index',
    columns=['count'],
)
# ... and order it so the most frequent words come first.
obama_word_count_table = obama_word_count_table.sort_values('count', ascending=False)
obama_word_count_table.head()

Unnamed: 0,count
the,40
in,30
and,21
of,18
to,14


In [6]:
# Compute the per-article word tally once, then attach it as a new column.
word_counts_per_article = df['text'].map(word_count)
df['word_count'] = word_counts_per_article
df.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ..."


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configured to drop scikit-learn's built-in English stop words.
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from every article and, in the same
# pass, build the TF-IDF matrix for the whole corpus (one row per article).
df_tf_idf_values = tf_idf_vectorizer.fit_transform(raw_documents=df['text'])

In [14]:
# Despite its name, `tf_idf_values` holds the fitted vectorizer's IDF weights
# (the idf_ attribute); `tf_idf_vocab` maps each term to its index in that array.
tf_idf_values = tf_idf_vectorizer.idf_
tf_idf_vocab = tf_idf_vectorizer.vocabulary_

# Weight each of Obama's raw word counts by that word's IDF. Words missing from
# the fitted vocabulary (e.g. the removed stop words) are skipped.
# NOTE(review): this is a plain count * idf product, so it is presumably not on
# the same scale as the rows produced by tf_idf_vectorizer.transform — confirm.
obama_counts = obama_word_count_values.values[0]
obama_tf_idf_values = {
    term: occurrences * tf_idf_values[tf_idf_vocab[term]]
    for term, occurrences in obama_counts.items()
    if term in tf_idf_vocab
}

# The column keeps the label 'count' even though it now holds count * idf scores.
obama_tf_idf_table = pd.DataFrame.from_dict(
    obama_tf_idf_values,
    orient='index',
    columns=['count'],
)
obama_tf_idf_table = obama_tf_idf_table.sort_values('count', ascending=False)
obama_tf_idf_table.head()

Unnamed: 0,count
obama,52.277114
act,35.674051
iraq,21.741728
law,20.721856
control,18.88433


In [22]:
# Inspect the IDF weights learned by the fitted vectorizer (indexed by the
# positions stored in vocabulary_).
tf_idf_vectorizer.idf_

array([ 7.75240581,  7.82762923, 11.29336514, ..., 11.29336514,
       10.88790003, 11.29336514])

In [15]:
# Rows for Bill Clinton and David Beckham, selected by exact name match.
clinton = df.loc[df['name'].eq('Bill Clinton')]
beckham = df.loc[df['name'].eq('David Beckham')]

In [17]:
# Project each person's article into the TF-IDF space learned from the full
# corpus. transform() returns one row per input document; [0] keeps the first
# row, which stays a 2-D (single-row) sparse matrix.
obama_tf_idf = tf_idf_vectorizer.transform(obama['text'])[0]
clinton_tf_idf = tf_idf_vectorizer.transform(clinton['text'])[0]
beckham_tf_idf = tf_idf_vectorizer.transform(beckham['text'])[0]

# Backward-compatible alias: the vector was originally bound to the misspelled
# name `clintona_tf_idf`, which other cells still reference.
clintona_tf_idf = clinton_tf_idf

In [18]:
from sklearn.metrics.pairwise import cosine_distances

# Cosine distance between the Obama and Clinton TF-IDF vectors; with one row on
# each side the result is a 1x1 array.
cosine_distances(X=obama_tf_idf, Y=clintona_tf_idf)

array([[0.81103282]])

In [19]:
from sklearn.neighbors import NearestNeighbors

# Index every article's TF-IDF row for nearest-neighbour lookups.
# NOTE(review): no metric or n_neighbors is passed, so scikit-learn's defaults
# apply (Euclidean distance per the docs), whereas the pairwise check on the
# Obama/Clinton vectors uses cosine distance — confirm which metric is intended.
knn = (
    NearestNeighbors()
    .fit(df_tf_idf_values)
)

In [20]:
# Query the fitted index with Obama's vector. Both results are 2-D with one row
# per query vector, so row 0 holds the neighbours of the single query.
dist_matrix, id_matrix = knn.kneighbors(obama_tf_idf, return_distance=True)
n_dist = dist_matrix[0]
n_ids = id_matrix[0]

# The returned ids are row positions in the fitted matrix, hence .iloc.
n_name = df['name'].iloc[n_ids]

# n_name carries df's index labels, which become the index of the table.
neighbors = pd.DataFrame({'distance': n_dist, 'name': n_name})
neighbors = neighbors.sort_values('distance')
neighbors.head()

Unnamed: 0,distance,name
35817,0.0,Barack Obama
24478,1.165145,Joe Biden
38376,1.207369,Samantha Power
57108,1.21964,Hillary Rodham Clinton
38714,1.222509,Eric Stern (politician)
