# Summarize K-Means (K=100) Cluster for full-bow

In [1]:
# modify these for your own computer
# repo_directory: root of the law-net checkout; it is appended to sys.path below
# so the project modules (summarize_clusters, helpful_functions, bag_of_words) import.
# NOTE: the csv_dir* and clusters_dir paths further down are hard-coded separately
# and must be edited as well.
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

# data_dir: root folder from which raw_dir / subnet_dir / text_dir / nlp_dir are derived
data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict
import json


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm

import scipy.sparse
import random
import itertools
from itertools import combinations


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *
from helpful_functions import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used (all derived from data_dir / network_name)
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' # tf-idf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' # bag-of-words matrix (and other info, i.e. vocab); this is the directory passed to load_bow below
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
# NOTE(review): these are absolute paths that are NOT derived from repo_directory,
# so they have to be edited by hand on another machine.
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"
csv_dir_mod = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_modularity/"
csv_dir_walk = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_walktrap/"
csv_dir_full_tfidf_km10 = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_full_tfidf_km10/"
csv_dir_full_tfidf_km100 = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_full_tfidf_km100/"
csv_dir_full_bow_km10 = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_full_bow_km10/"
csv_dir_full_bow_km100 = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_full_bow_km100/"  # output folder used by every cell in this notebook


# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions (all_opinions comes from the star-imported project code; the result
# is passed to top_k_words_from_difference in the summary cells below)
all_the_opinions = all_opinions(file_paths)

# clusters directory: the metadata cells below open clusters_dir + <opinion id> + ".json"
# NOTE(review): hard-coded absolute path, not derived from data_dir / raw_dir
clusters_dir = "C:/Users/Michael/Desktop/network_data/raw/scotus/clusters/"

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load bow vectors
**bow_matrix** = (row_index, column_index): token/word-count value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in bow_matrix (correspond to column indices)

In [2]:
def load_bow(nlp_dir):
    """
    Load the saved bag-of-words artifacts stored in ``nlp_dir``.

    Usage:
        bow_matrix, op_id_to_bow_id, vocab = load_bow(nlp_dir)

    Parameters
    ----------
    nlp_dir : str
        Directory containing 'bag_of_words_matrix.npz', 'op_id_to_bow_id.p'
        and 'vocab.p'. A trailing path separator is optional.

    Returns
    -------
    bow_matrix
        Whatever ``load_sparse_csr`` returns for 'bag_of_words_matrix.npz'.
    op_id_to_bow_id
        Object unpickled from 'op_id_to_bow_id.p'.
    vocab
        Object unpickled from 'vocab.p'.
    """
    # os.path.join yields the same path as plain concatenation when nlp_dir
    # ends with a separator, and still works when it does not.
    bow_matrix = load_sparse_csr(os.path.join(nlp_dir, 'bag_of_words_matrix.npz'))

    # NOTE: unpickling can execute arbitrary code -- only load files you created
    # yourself or otherwise trust.
    with open(os.path.join(nlp_dir, 'op_id_to_bow_id.p'), 'rb') as f:
        op_id_to_bow_id = pickle.load(f)

    with open(os.path.join(nlp_dir, 'vocab.p'), 'rb') as f:
        vocab = pickle.load(f)

    return bow_matrix, op_id_to_bow_id, vocab

In [3]:
# Load the raw bag-of-words artifacts from nlp_bow_dir (not the tf-idf directory).
bow_matrix, op_id_to_bow_id, vocab = load_bow(nlp_bow_dir)

In [4]:
# Show the matrix repr (shape, dtype, number of stored elements) as a sanity check.
bow_matrix

<27885x567570 sparse matrix of type '<type 'numpy.int64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

In [5]:
# Cluster assignments computed elsewhere; columns shown below are
# 'Unnamed: 0', 'mod', 'wt', 'km_10' and 'km_100'.
# NOTE(review): 'Unnamed: 0' looks like the opinion id written out as the CSV index -- confirm.
clusters = pd.read_csv(csv_dir + 'clusters_full_bow.csv')
clusters.head()

Unnamed: 0.1,Unnamed: 0,mod,wt,km_10,km_100
0,145658,1,5,0,32
1,89370,3,294,0,32
2,89371,0,35,0,32
3,89372,0,3,0,32
4,89373,0,3,0,32


In [6]:
# Cluster label of every opinion taken from the 'km_100' column.
# NOTE(review): the variable name says "tfidf", but the labels come from
# clusters_full_bow.csv (the bag-of-words clustering).
nlp_tfidf_clusters = clusters['km_100'].tolist()

# Re-index the labels by opinion id. Labels and ids are paired purely by position,
# so this assumes the CSV rows are in the same order as op_id_to_bow_id -- TODO confirm.
nlp_clusters = pd.Series(nlp_tfidf_clusters, index=op_id_to_bow_id)
nlp_clusters.head()

145658    32
89370     32
89371     32
89372     32
89373     32
dtype: int64

## get the top 5 biggest clusters

In [7]:
'''
dict_top_n_clusters = dictionary of top K clusters 
                      (key=cluster #, value=opinions in cluster)
                      
biggest_n_clusters = list of top K clusters (int)
'''

# get_top_n_clusters comes from the star-imported project code.
# NOTE(review): 5 is presumably the number of clusters to keep and 100 the total
# number of k-means clusters -- confirm against the helper's signature.
# The per-cluster sizes shown below appear to be printed by the helper itself.
dict_top_n_clusters, biggest_n_clusters = get_top_n_clusters(5, 100, nlp_clusters)

cluster 32 : 7802 opinions
cluster 20 : 5371 opinions
cluster 0 : 2147 opinions
cluster 16 : 1553 opinions
cluster 63 : 1249 opinions


# Top K Words of Each Cluster
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

# Top K Words ($\mu_{cluster}$) of Each Cluster
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

# Top K Words ($\mu_{cluster} - \mu_{complement}$ ) of Each Cluster
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

# Most Relevant Opinion of Each Cluster
compute the mean tf-idf vector, return the document in the cluster closest to the mean  

# Cluster 32 Summary (7802 opinions)

In [8]:
%%time

# Collect the case name, filing date and CourtListener URL of every opinion in
# cluster 32 from its metadata json, then save the table as a CSV.
opinion_names = []
opinion_dates = []
opinion_links = []
missing_ops = []  # opinion ids whose metadata json could not be opened

for op_id in dict_top_n_clusters[32]:
    try:
        with open(clusters_dir + op_id + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        # No metadata file for this opinion. Skip it explicitly: falling through
        # would re-append the previous opinion's `data` (or raise NameError on
        # the very first iteration).
        # (A fallback via case_info2(op_id) was sketched here but left disabled.)
        missing_ops.append(op_id)
        continue

    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)

if missing_ops:
    print("%d opinions skipped (metadata json not found)" % len(missing_ops))

# `columns=` pins the column order regardless of dict ordering.
cluster_info = pd.DataFrame(
    {'names': opinion_names, 'dates': opinion_dates, 'url': opinion_links},
    columns=['names', 'dates', 'url'],
)

cluster_info.to_csv(csv_dir_full_bow_km100 + "cluster_32.csv")

print(cluster_info.shape)

(7802, 3)
Wall time: 3.42 s


In [9]:
# Read the saved table back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_info = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_32.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,DaimlerChrysler Corp. v. Cuno,2006-05-15,https://www.courtlistener.com/opinion/145658/d...
1,1,Garfielde v. United States,1876-05-18,https://www.courtlistener.com/opinion/89370/ga...
2,2,Whiteside v. United States,1876-11-27,https://www.courtlistener.com/opinion/89371/wh...
3,3,Barkley v. Levee Commissioners,1876-12-18,https://www.courtlistener.com/opinion/89372/ba...
4,4,Broughton v. Pensacola,1876-12-18,https://www.courtlistener.com/opinion/89373/br...


In [10]:
%%time

# Word-level summary of cluster 32, computed from bow_matrix with the project
# helpers and written to cluster_32_summary.csv.
k = 1000  # number of words to get

top_words = top_k_words(dict_top_n_clusters[32], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[32], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[32], all_the_opinions, k, bow_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[32], bow_matrix, op_id_to_bow_id)

# utf-8 encode every word before it goes into the table
top_words = [word.encode('utf-8') for word in top_words]
top_words_from_mean = [word.encode('utf-8') for word in top_words_from_mean]
top_words_from_diff = [word.encode('utf-8') for word in top_words_from_diff]

# One row per rank; OrderedDict keeps the three word columns in this order.
cluster_summary = pd.DataFrame(OrderedDict([
    ('top_words', top_words),
    ('top_words_from_mean', top_words_from_mean),
    ('top_words_from_diff', top_words_from_diff),
]))
# the saved output shows the same opinion id repeated on every row
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_full_bow_km100 + "cluster_32_summary.csv")

print(cluster_summary.shape)

(1000, 4)
Wall time: 1min 1s


In [11]:
# Read the saved summary back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_summary = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_32_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,feet,court,feet,99961
1,1,state,state,insur,99961
2,2,e,v,rate,99961
3,3,n,case,price,99961
4,4,w,act,compani,99961


# Cluster 20 Summary (5371 opinions)

In [12]:
%%time

# Collect the case name, filing date and CourtListener URL of every opinion in
# cluster 20 from its metadata json, then save the table as a CSV.
opinion_names = []
opinion_dates = []
opinion_links = []
missing_ops = []  # opinion ids whose metadata json could not be opened

for op_id in dict_top_n_clusters[20]:
    try:
        with open(clusters_dir + op_id + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        # No metadata file for this opinion. Skip it explicitly: falling through
        # would re-append the previous opinion's `data` (or raise NameError on
        # the very first iteration).
        # (A fallback via case_info2(op_id) was sketched here but left disabled.)
        missing_ops.append(op_id)
        continue

    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)

if missing_ops:
    print("%d opinions skipped (metadata json not found)" % len(missing_ops))

# `columns=` pins the column order regardless of dict ordering.
cluster_info = pd.DataFrame(
    {'names': opinion_names, 'dates': opinion_dates, 'url': opinion_links},
    columns=['names', 'dates', 'url'],
)

cluster_info.to_csv(csv_dir_full_bow_km100 + "cluster_20.csv")

print(cluster_info.shape)

(5371, 3)
Wall time: 2.55 s


In [13]:
# Read the saved table back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_info = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_20.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Windsor v. McVeigh,1876-12-11,https://www.courtlistener.com/opinion/89375/wi...
1,1,Indianapolis & St. Louis R. Co. v. Horst,1876-12-18,https://www.courtlistener.com/opinion/89378/in...
2,2,Wilson v. Daniel,1798-08-17,https://www.courtlistener.com/opinion/2224796/...
3,3,Baltimore & Ohio R. Co. v. Kepner,1941-11-10,https://www.courtlistener.com/opinion/103549/b...
4,4,Unio Pac. R. Co. v. United States,1941-10-13,https://www.courtlistener.com/opinion/103541/u...


In [14]:
%%time

# Word-level summary of cluster 20, computed from bow_matrix with the project
# helpers and written to cluster_20_summary.csv.
k = 1000  # number of words to get

top_words = top_k_words(dict_top_n_clusters[20], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[20], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[20], all_the_opinions, k, bow_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[20], bow_matrix, op_id_to_bow_id)

# utf-8 encode every word before it goes into the table
top_words = [word.encode('utf-8') for word in top_words]
top_words_from_mean = [word.encode('utf-8') for word in top_words_from_mean]
top_words_from_diff = [word.encode('utf-8') for word in top_words_from_diff]

# One row per rank; OrderedDict keeps the three word columns in this order.
cluster_summary = pd.DataFrame(OrderedDict([
    ('top_words', top_words),
    ('top_words_from_mean', top_words_from_mean),
    ('top_words_from_diff', top_words_from_diff),
]))
# the saved output shows the same opinion id repeated on every row
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_full_bow_km100 + "cluster_20_summary.csv")

print(cluster_summary.shape)

(1000, 4)
Wall time: 44.5 s


In [15]:
# Read the saved summary back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_summary = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_20_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,law,court,offic,102039
1,1,punish,state,law,102039
2,2,contract,v,execut,102039
3,3,state,case,court,102039
4,4,sentenc,act,v,102039


# Cluster 0 Summary (2147 opinions)

In [16]:
%%time

# Collect the case name, filing date and CourtListener URL of every opinion in
# cluster 0 from its metadata json, then save the table as a CSV.
opinion_names = []
opinion_dates = []
opinion_links = []
missing_ops = []  # opinion ids whose metadata json could not be opened

for op_id in dict_top_n_clusters[0]:
    try:
        with open(clusters_dir + op_id + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        # No metadata file for this opinion. Skip it explicitly: falling through
        # would re-append the previous opinion's `data` (or raise NameError on
        # the very first iteration).
        # (A fallback via case_info2(op_id) was sketched here but left disabled.)
        missing_ops.append(op_id)
        continue

    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)

if missing_ops:
    print("%d opinions skipped (metadata json not found)" % len(missing_ops))

# `columns=` pins the column order regardless of dict ordering.
cluster_info = pd.DataFrame(
    {'names': opinion_names, 'dates': opinion_dates, 'url': opinion_links},
    columns=['names', 'dates', 'url'],
)

cluster_info.to_csv(csv_dir_full_bow_km100 + "cluster_0.csv")

print(cluster_info.shape)

(2147, 3)
Wall time: 827 ms


In [17]:
# Read the saved table back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_info = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_0.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Strickler v. Greene,1999-06-17,https://www.courtlistener.com/opinion/118307/s...
1,1,National Bank v. Kimball,1881-03-30,https://www.courtlistener.com/opinion/90393/na...
2,2,Sherman v. United States,1895-01-14,https://www.courtlistener.com/opinion/94076/sh...
3,3,"US Airways, Inc. v. McCutchen",2013-04-16,https://www.courtlistener.com/opinion/858085/u...
4,4,Massachusetts v. Missouri,1939-11-06,https://www.courtlistener.com/opinion/103229/m...


In [18]:
%%time

# Word-level summary of cluster 0, computed from bow_matrix with the project
# helpers and written to cluster_0_summary.csv.
k = 1000  # number of words to get

top_words = top_k_words(dict_top_n_clusters[0], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[0], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[0], all_the_opinions, k, bow_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[0], bow_matrix, op_id_to_bow_id)

# utf-8 encode every word before it goes into the table
top_words = [word.encode('utf-8') for word in top_words]
top_words_from_mean = [word.encode('utf-8') for word in top_words_from_mean]
top_words_from_diff = [word.encode('utf-8') for word in top_words_from_diff]

# One row per rank; OrderedDict keeps the three word columns in this order.
cluster_summary = pd.DataFrame(OrderedDict([
    ('top_words', top_words),
    ('top_words_from_mean', top_words_from_mean),
    ('top_words_from_diff', top_words_from_diff),
]))
# the saved output shows the same opinion id repeated on every row
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_full_bow_km100 + "cluster_0_summary.csv")

print(cluster_summary.shape)

(1000, 4)
Wall time: 19.5 s


In [19]:
# Read the saved summary back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_summary = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_0_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,state,court,court,104977
1,1,court,state,state,104977
2,2,law,v,unit,104977
3,3,school,case,u,104977
4,4,power,act,act,104977


# Cluster 16 Summary (1553 opinions)

In [20]:
%%time

# Collect the case name, filing date and CourtListener URL of every opinion in
# cluster 16 from its metadata json, then save the table as a CSV.
opinion_names = []
opinion_dates = []
opinion_links = []
missing_ops = []  # opinion ids whose metadata json could not be opened

for op_id in dict_top_n_clusters[16]:
    try:
        with open(clusters_dir + op_id + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        # No metadata file for this opinion. Skip it explicitly: falling through
        # would re-append the previous opinion's `data` (or raise NameError on
        # the very first iteration).
        # (A fallback via case_info2(op_id) was sketched here but left disabled.)
        missing_ops.append(op_id)
        continue

    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)

if missing_ops:
    print("%d opinions skipped (metadata json not found)" % len(missing_ops))

# `columns=` pins the column order regardless of dict ordering.
cluster_info = pd.DataFrame(
    {'names': opinion_names, 'dates': opinion_dates, 'url': opinion_links},
    columns=['names', 'dates', 'url'],
)

cluster_info.to_csv(csv_dir_full_bow_km100 + "cluster_16.csv")

print(cluster_info.shape)

(1553, 3)
Wall time: 383 ms


In [21]:
# Read the saved table back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_info = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_16.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Pullman Co. v. Kansas Ex Rel. Coleman,1910-01-31,https://www.courtlistener.com/opinion/97151/pu...
1,1,Smietanka v. Indiana Steel Co.,1921-10-24,https://www.courtlistener.com/opinion/99850/sm...
2,2,Randon v. Toby,1851-03-18,https://www.courtlistener.com/opinion/86657/ra...
3,3,Van Buren v. Digges,1851-03-11,https://www.courtlistener.com/opinion/86655/va...
4,4,Prewit v. Wilson,1881-02-28,https://www.courtlistener.com/opinion/90292/pr...


In [22]:
%%time

# Word-level summary of cluster 16, computed from bow_matrix with the project
# helpers and written to cluster_16_summary.csv.
k = 1000  # number of words to get

top_words = top_k_words(dict_top_n_clusters[16], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[16], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[16], all_the_opinions, k, bow_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[16], bow_matrix, op_id_to_bow_id)

# utf-8 encode every word before it goes into the table
top_words = [word.encode('utf-8') for word in top_words]
top_words_from_mean = [word.encode('utf-8') for word in top_words_from_mean]
top_words_from_diff = [word.encode('utf-8') for word in top_words_from_diff]

# One row per rank; OrderedDict keeps the three word columns in this order.
cluster_summary = pd.DataFrame(OrderedDict([
    ('top_words', top_words),
    ('top_words_from_mean', top_words_from_mean),
    ('top_words_from_diff', top_words_from_diff),
]))
# the saved output shows the same opinion id repeated on every row
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_full_bow_km100 + "cluster_16_summary.csv")

print(cluster_summary.shape)

(1000, 4)
Wall time: 15.8 s


In [23]:
# Read the saved summary back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_summary = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_16_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,§,court,feder,87704
1,1,state,state,case,87704
2,2,candid,v,bill,87704
3,3,parti,case,constitut,87704
4,4,bill,act,contribut,87704


# Cluster 63 Summary (1249 opinions)

In [24]:
%%time

# Collect the case name, filing date and CourtListener URL of every opinion in
# cluster 63 from its metadata json, then save the table as a CSV.
opinion_names = []
opinion_dates = []
opinion_links = []
missing_ops = []  # opinion ids whose metadata json could not be opened

for op_id in dict_top_n_clusters[63]:
    try:
        with open(clusters_dir + op_id + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        # No metadata file for this opinion. Skip it explicitly: falling through
        # would re-append the previous opinion's `data` (or raise NameError on
        # the very first iteration).
        # (A fallback via case_info2(op_id) was sketched here but left disabled.)
        missing_ops.append(op_id)
        continue

    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)

if missing_ops:
    print("%d opinions skipped (metadata json not found)" % len(missing_ops))

# `columns=` pins the column order regardless of dict ordering.
cluster_info = pd.DataFrame(
    {'names': opinion_names, 'dates': opinion_dates, 'url': opinion_links},
    columns=['names', 'dates', 'url'],
)

cluster_info.to_csv(csv_dir_full_bow_km100 + "cluster_63.csv")

print(cluster_info.shape)

(1249, 3)
Wall time: 259 ms


In [25]:
# Read the saved table back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_info = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_63.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Oklahoma Ex Rel. Phillips v. Guy F. Atkinson Co.,1941-06-02,https://www.courtlistener.com/opinion/103544/o...
1,1,Ex Parte Harley-Davidson Motor Co.,1922-06-05,https://www.courtlistener.com/opinion/100019/e...
2,2,Schaefer v. Werling,1903-02-23,https://www.courtlistener.com/opinion/95799/sc...
3,3,"National Safe Deposit, Sav. & Trust Co. of DC ...",1913-06-10,https://www.courtlistener.com/opinion/97930/na...
4,4,Hendy v. Golden State & Miners' Iron Works,1888-05-14,https://www.courtlistener.com/opinion/92260/he...


In [26]:
%%time

# Word-level summary of cluster 63, computed from bow_matrix with the project
# helpers and written to cluster_63_summary.csv.
k = 1000  # number of words to get

top_words = top_k_words(dict_top_n_clusters[63], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[63], k, bow_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[63], all_the_opinions, k, bow_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[63], bow_matrix, op_id_to_bow_id)

# utf-8 encode every word before it goes into the table
top_words = [word.encode('utf-8') for word in top_words]
top_words_from_mean = [word.encode('utf-8') for word in top_words_from_mean]
top_words_from_diff = [word.encode('utf-8') for word in top_words_from_diff]

# One row per rank; OrderedDict keeps the three word columns in this order.
cluster_summary = pd.DataFrame(OrderedDict([
    ('top_words', top_words),
    ('top_words_from_mean', top_words_from_mean),
    ('top_words_from_diff', top_words_from_diff),
]))
# the saved output shows the same opinion id repeated on every row
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_full_bow_km100 + "cluster_63_summary.csv")

print(cluster_summary.shape)

(1000, 4)
Wall time: 13.4 s


In [27]:
# Read the saved summary back as a sanity check. The CSV was written together with
# its row index, which is why an extra 'Unnamed: 0' column appears here.
cluster_summary = pd.read_csv(csv_dir_full_bow_km100 + 'cluster_63_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,school,court,said,1087734
1,1,religi,state,school,1087734
2,2,aid,v,compani,1087734
3,3,racial,case,bond,1087734
4,4,sentenc,act,religi,1087734
