# Hierarchical Clustering

In [1]:
import pandas as pd                                     # see below for install instruction
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans                # we'll be using scikit-learn's KMeans for this assignment
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
%matplotlib inline

In [3]:
# Read the people_wiki CSV into a pandas DataFrame; the path is relative to
# the notebook's working directory.
wiki = pd.read_csv('people_wiki.csv')

In [4]:
def load_sparse_csr(filename):
    '''Load a CSR sparse matrix that was saved as an .npz archive.

    filename: path (or file-like object) passed straight to np.load. The
              archive must contain the arrays 'data', 'indices', 'indptr'
              and 'shape'.

    Returns a scipy.sparse.csr_matrix assembled from those four arrays.
    Raises KeyError if one of the expected arrays is missing.
    '''
    # Use the archive as a context manager so the underlying file handle is
    # closed deterministically instead of whenever the object is collected.
    # Each array is read fully into memory on access, so it stays valid
    # after the archive is closed.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Stored as a small integer array; convert to a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

# Rebuild the sparse TF-IDF matrix from its saved CSR components.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')
# Read the index-to-word mapping from JSON as a pandas Series (typ='series').
map_index_to_word = pd.read_json('people_wiki_map_index_to_word.json',typ='series')

In [5]:
# Rescale the TF-IDF matrix with scikit-learn's normalize() using its default
# settings (per the sklearn docs: L2 norm, applied to each row).
# NOTE(review): this rebinds `tf_idf`, so the un-normalized matrix is no
# longer reachable after this cell runs.
tf_idf = normalize(tf_idf)

In [7]:
def bipartition(cluster, maxiter=400, num_runs=4, seed=None, verbose=1):
    '''Split one cluster into two child clusters by running k-means with k=2.

    cluster:  should be a dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster
                             (not read by this function)
    maxiter:  forwarded to KMeans as max_iter.
    num_runs: forwarded to KMeans as n_init.
    seed:     forwarded to KMeans as random_state.
    verbose:  forwarded to KMeans as verbose. The default of 1 keeps the
              per-iteration log; pass 0 to silence it.

    Returns a tuple (cluster_left_child, cluster_right_child). Each child is
    a dictionary with the keys 'matrix', 'dataframe' and 'centroid', holding
    the rows assigned to label 0 (left) or label 1 (right) and the matching
    k-means centroid.
    '''

    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    # `n_jobs` is intentionally not passed: it was deprecated in scikit-learn
    # 0.23 and removed in 1.0, where supplying it raises a TypeError.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs,
                          random_state=seed, verbose=verbose)
    kmeans_model.fit(data_matrix)
    centroids = kmeans_model.cluster_centers_
    cluster_assignment = np.asarray(kmeans_model.labels_)

    # One boolean row mask per child, shared by the matrix and the dataframe
    # so both are split identically.
    in_left  = cluster_assignment == 0
    in_right = cluster_assignment == 1

    # Package relevant variables for the child clusters
    cluster_left_child  = {'matrix': data_matrix[in_left],
                           'dataframe': dataframe[in_left],
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix[in_right],
                           'dataframe': dataframe[in_right],
                           'centroid': centroids[1]}

    return (cluster_left_child, cluster_right_child)

In [8]:
# Root cluster: the full normalized TF-IDF matrix paired with the full dataframe.
wiki_data = {'matrix': tf_idf, 'dataframe': wiki} # no 'centroid' for the root cluster
# First split of the hierarchy; seed=1 fixes the k-means random state so the
# split is repeatable across runs.
left_child, right_child = bipartition(wiki_data, maxiter=100, num_runs=6, seed=1)

Initialization complete
Iteration  0, inertia 116403.036
Iteration  1, inertia 58351.852
Iteration  2, inertia 58297.635
Iteration  3, inertia 58261.706
Iteration  4, inertia 58229.069
Iteration  5, inertia 58220.610
Iteration  6, inertia 58219.679
Iteration  7, inertia 58219.221
Iteration  8, inertia 58218.996
Iteration  9, inertia 58218.868
Iteration 10, inertia 58218.790
Iteration 11, inertia 58218.744
Iteration 12, inertia 58218.713
Iteration 13, inertia 58218.688
Iteration 14, inertia 58218.672
Iteration 15, inertia 58218.659
Iteration 16, inertia 58218.653
Iteration 17, inertia 58218.649
Iteration 18, inertia 58218.647
Iteration 19, inertia 58218.645
Iteration 20, inertia 58218.644
Iteration 21, inertia 58218.643
Iteration 22, inertia 58218.643
Iteration 23, inertia 58218.642
Iteration 24, inertia 58218.642
Converged at iteration 24: center shift 0.000000e+00 within tolerance 1.803739e-10
Initialization complete
Iteration  0, inertia 116703.253
Iteration  1, inertia 58287.513
Ite

Iteration  5, inertia 58184.547
Iteration  6, inertia 58179.865
Iteration  7, inertia 58179.673
Iteration  8, inertia 58179.617
Iteration  9, inertia 58179.596
Iteration 10, inertia 58179.586
Iteration 11, inertia 58179.582
Iteration 12, inertia 58179.580
Iteration 13, inertia 58179.578
Iteration 14, inertia 58179.577
Iteration 15, inertia 58179.576
Iteration 16, inertia 58179.575
Iteration 17, inertia 58179.575
Iteration 18, inertia 58179.575
Converged at iteration 18: center shift 0.000000e+00 within tolerance 1.803739e-10


In [9]:
# Display the left child cluster (matrix, dataframe and centroid).
# NOTE(review): this prints the whole dict, including the full dataframe repr;
# consider showing only the matrix shape and dataframe.head() instead.
left_child

{'matrix': <11510x547979 sparse matrix of type '<class 'numpy.float64'>'
 	with 1885831 stored elements in Compressed Sparse Row format>,
 'dataframe':                                                      URI  \
 0            <http://dbpedia.org/resource/Digby_Morrell>   
 17     <http://dbpedia.org/resource/Paddy_Dunne_(Gael...   
 21           <http://dbpedia.org/resource/Ceiron_Thomas>   
 22            <http://dbpedia.org/resource/Adel_Sellimi>   
 25             <http://dbpedia.org/resource/Vic_Stasiuk>   
 28            <http://dbpedia.org/resource/Leon_Hapgood>   
 30               <http://dbpedia.org/resource/Dom_Flora>   
 33               <http://dbpedia.org/resource/Bob_Reece>   
 41     <http://dbpedia.org/resource/Bob_Adams_(Americ...   
 48              <http://dbpedia.org/resource/Marc_Logan>   
 49          <http://dbpedia.org/resource/Corey_Woolfolk>   
 63              <http://dbpedia.org/resource/Alan_Roper>   
 75      <http://dbpedia.org/resource/Vladimir_Yurchenko

In [10]:
# Display the right child cluster (matrix, dataframe and centroid).
# NOTE(review): this prints the whole dict, including the full dataframe repr;
# consider showing only the matrix shape and dataframe.head() instead.
right_child

{'matrix': <47561x547979 sparse matrix of type '<class 'numpy.float64'>'
 	with 8493452 stored elements in Compressed Sparse Row format>,
 'dataframe':                                                      URI  \
 1           <http://dbpedia.org/resource/Alfred_J._Lewy>   
 2            <http://dbpedia.org/resource/Harpdog_Brown>   
 3      <http://dbpedia.org/resource/Franz_Rottensteiner>   
 4                   <http://dbpedia.org/resource/G-Enka>   
 5            <http://dbpedia.org/resource/Sam_Henderson>   
 6            <http://dbpedia.org/resource/Aaron_LaCrate>   
 7          <http://dbpedia.org/resource/Trevor_Ferguson>   
 8             <http://dbpedia.org/resource/Grant_Nelson>   
 9             <http://dbpedia.org/resource/Cathy_Caruth>   
 10            <http://dbpedia.org/resource/Sophie_Crumb>   
 11           <http://dbpedia.org/resource/Jenn_Ashworth>   
 12        <http://dbpedia.org/resource/Jonathan_Hoefler>   
 13     <http://dbpedia.org/resource/Anthony_Gueterboc..