In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.feature_extraction import DictVectorizer
import scipy
from sklearn.neighbors import KNeighborsRegressor

# Load some text data - from wikipedia, pages on people

In [6]:
people = pd.read_csv("people_wiki.csv")
print(len(people))
people.head(5)

59071


Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


# Explore the dataset and checkout the text it contains

In [7]:
obama = people[people["name"] == "Barack Obama"]
obama["text"].iloc[0]

'barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and afte

In [8]:
clooney = people[people["name"] == "George Clooney"]
clooney["text"].iloc[0]

'george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sinatra as d

# Get word count for Obama article

In [9]:
def word_count(sentence):
    word_count_dict = dict(Counter(word.lower() for word in nltk.word_tokenize(str(sentence))))
    return word_count_dict

obama["word_count"] = obama["text"].apply(word_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [10]:
obama.head(5)

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


## Sort the word count for the Obama article

In [11]:
obama_word_count_table = pd.DataFrame.from_dict(obama[["word_count"]].iloc[0][0], orient='index')
obama_word_count_table = obama_word_count_table.reset_index()
obama_word_count_table.columns = ['word', "count"]
obama_word_count_table = obama_word_count_table.sort_values("count", ascending=False)
obama_word_count_table.head(5)

Unnamed: 0,word,count
12,the,40
26,in,30
14,and,21
17,of,18
23,to,14


### Compute TF-IDF for the corpus

In [12]:
people["word_count"] = people["text"].apply(word_count)
people.head(5)

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ..."


In [13]:

clooney = people[people["name"] == "George Clooney"]

clooney_word_count_table = pd.DataFrame.from_dict(clooney[["word_count"]].iloc[0][0], orient='index')
clooney_word_count_table = clooney_word_count_table.reset_index()
clooney_word_count_table.columns = ['word', "count"]
clooney_word_count_table = clooney_word_count_table.sort_values("count", ascending=False)
clooney_word_count_table.head(5)



Unnamed: 0,word,count
31,the,35
14,and,15
38,in,14
66,of,14
23,for,13


In [14]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

dv = DictVectorizer()
X = dv.fit_transform(people["word_count"])
tv = TfidfTransformer()
tfidf = tv.fit_transform(X)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [15]:
print(tfidf.shape)
print(len(people))

(59071, 548555)
59071


In [16]:
df = pd.DataFrame(tfidf.toarray())

In [17]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,548545,548546,548547,548548,548549,548550,548551,548552,548553,548554
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df["name"] = people["name"]

In [19]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,548546,548547,548548,548549,548550,548551,548552,548553,548554,name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Digby Morrell
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Alfred J. Lewy
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Harpdog Brown
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Franz Rottensteiner
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G-Enka


In [20]:
u1 = df[df["name"] == "Barack Obama"].iloc[:, 0:548555].values.tolist()[0]
u2 = df[df["name"] == "George Clooney"].iloc[:, 0:548555].values.tolist()[0]
u3 = df[df["name"] == "Bill Clinton"].iloc[:, 0:548555].values.tolist()[0]
#u4 = df[df["name"] == "David Beckhan"].iloc[:, 0:548555].values.tolist()[0]

print(type(u1))

<class 'list'>


### Manually compute distance between few people

In [21]:
print("Obama Vs Clinton : ", scipy.spatial.distance.cosine(u3, u1))

Obama Vs Clinton :  0.6738752906183083


# Build nearest neighbor model for documents retrieval

In [None]:
# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(df.iloc[:, 0:548555], df["name"])
# Make point predictions on the test set using the fit model.
predictions = knn.predict(df.iloc[:, 0:548555])


In [3]:
people

NameError: name 'people' is not defined