In [182]:
# Clustering Wikipedia people articles using TF-IDF word features and k-means
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.cluster import KMeans

In [114]:
# Load the people-wiki dataset from a CSV file (path is relative to the working directory)
people = pd.read_csv('people_wiki.csv')

In [115]:
# Display the people DataFrame
people

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Choc\'late_Allen>,Choc\'late Allen,choclate allen born june 19 1993 is a child ac...
1,<http://dbpedia.org/resource/Nikki_Walker>,Nikki Walker,nikki walker born 5 march 1982 in aberdeen is ...
2,<http://dbpedia.org/resource/Philip_Haas>,Philip Haas,philip haas born 1954 is an american artist sc...
3,<http://dbpedia.org/resource/Kevin_Rubio>,Kevin Rubio,kevin rubio born december 20 1967 is an americ...
4,<http://dbpedia.org/resource/Fran%C3%A7ois-Xav...,Fran%C3%A7ois-Xavier Roth,franoisxavier roth born 6 november 1971 paris ...
...,...,...,...
1465,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
1466,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
1467,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
1468,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


In [116]:
### Explore data
## Take a look at the entry for Kevin Rubio
# Boolean-mask the rows whose 'name' equals the person we want to inspect
rubio = people.loc[people['name'].eq('Kevin Rubio')]

In [117]:
# Display the selected record
rubio

Unnamed: 0,URI,name,text
3,<http://dbpedia.org/resource/Kevin_Rubio>,Kevin Rubio,kevin rubio born december 20 1967 is an americ...


In [118]:
# Print the 'text' column of the selected record; rubio['text'] is a Series,
# so the printed form includes the row index, name and dtype
print(rubio['text'])

3    kevin rubio born december 20 1967 is an americ...
Name: text, dtype: object


In [119]:
# Use the article text column as the collection of documents to vectorize
documents = people['text']

In [120]:
# Write function to vectorize the word counts into a data frame
def MakeVector(doc_array):
    """Build a dense bag-of-words count table for a collection of documents.

    Parameters
    ----------
    doc_array : iterable of str
        Raw document strings; each one becomes one row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, holding
        the term counts produced by ``CountVectorizer``.
    """
    vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and counts the terms in one pass.
    word_matrix = vectorizer.fit_transform(doc_array)
    # Densify so the counts can be wrapped in an ordinary DataFrame.
    # NOTE: this allocates documents x vocabulary cells, which can be large.
    word_matrix = word_matrix.toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        word_names = vectorizer.get_feature_names_out()
    else:
        word_names = vectorizer.get_feature_names()
    dataset = pd.DataFrame(word_matrix, columns = word_names)
    return dataset

In [121]:
# Build the word-count table for all documents and keep its column labels
# (the vocabulary terms) so they can be reused for the TF-IDF frame
words_vector = MakeVector(documents)
word_columns = words_vector.columns

In [122]:
# Use the TF-IDF algorithm to turn the raw word counts into normalized
# word-frequency weights
tfidfer = TfidfTransformer()
tfidf_maker = tfidfer.fit_transform(words_vector)
# .toarray() densifies the transformer output so it fits a regular DataFrame
tfidf_matrix = tfidf_maker.toarray()
# Re-attach the vocabulary terms as column labels
dataset = pd.DataFrame(tfidf_matrix, columns = word_columns)

In [123]:
# Get the shape of the TF-IDF dataset as (rows, columns)
dataset.shape

(1470, 44027)

In [126]:
# Create a processed dataset with the name and text of the person followed by
# the TF-IDF columns
# NOTE(review): the rename presumably keeps 'name'/'text' from clashing with
# identically named word columns in `dataset` — confirm
people = people.rename(columns = {'name': 'Person_name', 'text': 'Person_text'})
# axis = 1 places the two frames side by side, aligned on the row index
proc_datasets = pd.concat([people[['Person_name', 'Person_text']], dataset], axis = 1)

In [199]:
# Select the TF-IDF feature columns to cluster on.
# Taking them from `dataset` instead of slicing proc_datasets.columns[2:]
# yields the same columns on a first run, but keeps any column appended to
# proc_datasets afterwards (such as the cluster label) out of the feature
# set when this cell is re-run.
cluster = dataset.columns

In [202]:
# Select only the feature columns as the input matrix for clustering
cluster_set = proc_datasets[cluster]

In [283]:
### Apply k-means clustering to group similar Wikipedia articles
## Build the clustering model
# random_state pins the centroid initialisation so cluster ids are
# reproducible across runs; n_init is set explicitly because its default
# differs between scikit-learn releases.
k_means = KMeans(n_clusters=15, n_init=10, random_state=42)
# fit() returns the fitted estimator itself, so k_means_clusters refers to
# the same object as k_means.
k_means_clusters = k_means.fit(cluster_set)

In [291]:
# Store each row's k-means cluster label in the 'cluster' column
# NOTE(review): if 'cluster' is also one of the word columns this assignment
# overwrites that column instead of adding a new one — confirm
proc_datasets['cluster'] = k_means_clusters.labels_

In [292]:
# Keep only the person's name, text and cluster label for searching
labeled_data = proc_datasets[['Person_name', 'Person_text', 'cluster']]

In [303]:
# Ask for a person's name and collect everyone in the same cluster.
# The match is case-insensitive rather than relying on str.title(), which
# rewrites names such as "John Simon (composer)" or "Choc'late Allen" into
# forms that do not exist in the dataset.
typed_name = input("Paste Person's Name:").strip()
name_matches = labeled_data['Person_name'].str.casefold() == typed_name.casefold()
matched_rows = labeled_data.loc[name_matches]
if matched_rows.empty:
    # Fail with a readable message instead of an opaque int() conversion error
    raise ValueError(f"No person named {typed_name!r} found in the dataset")
# Use the name exactly as stored in the dataset; if several rows share the
# name, the first one is used
person_search = matched_rows['Person_name'].iloc[0]
clust_num = int(matched_rows['cluster'].iloc[0])
print('Here is the cluster number:', clust_num)
# reset_index() keeps the original row positions in an 'index' column
search_select = labeled_data[labeled_data['cluster'] == clust_num].reset_index()

Paste Person's Name:Olari Elts
Here is the cluster number: 4


In [305]:
# Display every person that shares the searched person's cluster
search_select

Unnamed: 0,index,Person_name,Person_text,cluster
0,4,Fran%C3%A7ois-Xavier Roth,franoisxavier roth born 6 november 1971 paris ...,4
1,17,Frances Yip,frances yip laiyee born 1947 is a hong kong ca...,4
2,76,Robert Orledge,robert orledge is a leading scholar of early t...,4
3,77,Stefan Szkafarowsky,stefan szkafarowsky is an opera singerthe 2013...,4
4,78,Arthur Kisenyi,arthur kisenyi born 1990 is a ugandan actor si...,4
5,86,Antti Siirala,antti siirala born 16 may 1979 in helsinki is ...,4
6,113,Lev Vinocour,lev vinocour born 1970 is a pianist from russi...,4
7,163,John Simon (composer),john simon 12 march 1944 in cape town is a sou...,4
8,226,Caroline Kang,caroline kang new york 1980 is an american con...,4
9,311,Walter Prystawski,walter prystawski cm born 1933 is a canadian v...,4
