## Import necessary packages

In [4]:
## Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.mixture import BayesianGaussianMixture

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from scipy.stats import multivariate_normal

## Load Wikipedia data and extract TF-IDF features

In [5]:
wiki = pd.read_csv('people_wiki.csv')

In [10]:
wiki = wiki.rename(columns = {'name':'Person_name', 'text':'Person_text'})
documents = wiki['Person_text']

In [11]:
documents

0       choclate allen born june 19 1993 is a child ac...
1       nikki walker born 5 march 1982 in aberdeen is ...
2       philip haas born 1954 is an american artist sc...
3       kevin rubio born december 20 1967 is an americ...
4       franoisxavier roth born 6 november 1971 paris ...
                              ...                        
1465    olari elts born april 27 1971 in tallinn eston...
1466    scott francis crago born july 26 1963 twin bro...
1467    david william royce cass born 27 march 1962 in...
1468    keith hector elias born february 3 1972 in lac...
1469    fawaz mohammed damrah arabic fawwz damra was t...
Name: Person_text, Length: 1470, dtype: object

In [12]:
# Create Count Vectorizer

## Create Vectorizer object
vectorizer = CountVectorizer()

# create vector and convert it into a Matrix
feature_obj = vectorizer.fit_transform(documents)
doc_matrix = feature_obj.toarray()

# create word feature matrix
doc_matrix = normalize(doc_matrix)
doc_features = pd.DataFrame(doc_matrix, columns = vectorizer.get_feature_names())

As in the previous assignment, we will normalize each document's TF-IDF vector to be a unit vector. 

In [13]:
# Code used to create a smaller sample document martix for model building efficieny
np.random.seed(1)
random_columns = np.random.sample(len(doc_features.columns))
columns_random = np.array([np.array(doc_features.columns), random_columns])
select_vars = columns_random[1] <= 0.02
keep_vars = columns_random[0][select_vars]

sample_doc_features = doc_features[keep_vars]

In [23]:
# Now we Create a Bayesian Gaussian Mixture Model and evaluate the results for product popularity
bguassian_mixture_model = BayesianGaussianMixture(n_components=12, max_iter=50, verbose = 1, init_params ='random', tol = 0.01)
wiki_doc_mixture_model = bguassian_mixture_model.fit(sample_doc_features)

Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
Initialization converged: False




In [24]:
# Create a prediction for popularity
predicted_clust = wiki_doc_mixture_model.predict(sample_doc_features)
wiki['predicted_clust'] = predicted_clust

In [26]:
wiki = wiki[['Person_name', 'Person_text', 'predicted_clust']]

In [31]:
# create person search and get filtered dataset
person_search = input("Paste Person's Name:").title()
clust_num = int(wiki[wiki['Person_name'] == person_search]["predicted_clust"])
print('Here is the cluster number:', clust_num)
search_select = wiki[wiki['predicted_clust'] == clust_num].reset_index()

Paste Person's Name:rachel stevens
Here is the cluster number: 8


In [32]:
search_select

Unnamed: 0,index,Person_name,Person_text,predicted_clust
0,2,Philip Haas,philip haas born 1954 is an american artist sc...,8
1,3,Kevin Rubio,kevin rubio born december 20 1967 is an americ...,8
2,13,Rachel Stevens,rachel lauren stevens born 9 april 1978 is an ...,8
3,31,Audu Idris Umar,audu idris umar born 28 december 1959 was elec...,8
4,46,Nicholas Lorusso,nicholas joseph lorusso also known as nick lor...,8
...,...,...,...,...
82,1336,Marco Antonio Zago,marco antonio zago b january 11 1946 birigi so...,8
83,1397,J%C3%B8rgen Ingmann,jrgen ingmann born 26 april 1925 is a musician...,8
84,1398,Nicholas Rees,nicholas rees born february 17 1982 in nassau ...,8
85,1400,Ron C. Bigelow,ron c bigelow born 1948 is the eighth mayor of...,8
