In [19]:
# modify these for your own computer
#repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

#data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import sys
import matplotlib.pyplot as plt



# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
#from bag_of_words import load_tf_idf
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import re
import glob
import cPickle as pickle

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from text_normalization import *
from pipeline_helper_functions import save_sparse_csr, load_sparse_csr

## load tf-idf vector, bow vectors

In [4]:
# load the tf-idf matrix together with the opinion-id -> matrix-row mapping and the vocabulary
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

In [5]:
def load_bow(nlp_dir):
    """
    Load the bag-of-words matrix and its two lookup tables from disk.

    Usage: bow_matrix, op_id_to_bow_id, vocab = load_bow(nlp_dir)

    Parameters
    -----------
    nlp_dir: directory containing 'bag_of_words_matrix.npz',
        'op_id_to_bow_id.p' and 'vocab.p' (a trailing path separator is optional)

    Output
    -------
    bow_matrix: the matrix returned by load_sparse_csr for 'bag_of_words_matrix.npz'
    op_id_to_bow_id: the object unpickled from 'op_id_to_bow_id.p'
    vocab: the object unpickled from 'vocab.p'
    """
    # os.path.join keeps the paths valid whether or not nlp_dir ends with a separator
    bow_matrix = load_sparse_csr(os.path.join(nlp_dir, 'bag_of_words_matrix.npz'))

    # NOTE: unpickling runs arbitrary code stored in the file, so only load
    # pickle files that were produced by this project's own pipeline
    with open(os.path.join(nlp_dir, 'op_id_to_bow_id.p'), 'rb') as f:
        op_id_to_bow_id = pickle.load(f)

    with open(os.path.join(nlp_dir, 'vocab.p'), 'rb') as f:
        vocab = pickle.load(f)

    return bow_matrix, op_id_to_bow_id, vocab

In [6]:
# load the raw bag-of-words matrix; the `_2` names keep the tf-idf mapping and vocab loaded earlier from being overwritten
bow_matrix, op_id_to_bow_id_2, vocab_2 = load_bow(nlp_bow_dir)

# Clustering Work

In [13]:
# load the graph of the selected network (network_name) from its GraphML file in subnet_dir
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# largest connected component

Restrict our attention to the largest connected component of the network. We are also missing some text files from 2016, so we ignore cases from 2016.

In [14]:
# limit ourselves to cases up to and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
# (as_undirected() returns a new graph and leaves G untouched, so no explicit copy is needed)
Gud = G.as_undirected()

# get largest connected component
components = Gud.clusters(mode='STRONG')
# giant() returns the subgraph of the largest cluster without building every component's subgraph
g = components.giant()

# CL ids of cases in largest connected component
CLids = g.vs['name']

# graph clustering

Do community detection on network

## modularity on undirected scotus

In [15]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

# turn the community-detection result into a clustering object (it provides .summary() and .membership)
mod_clust = cd_modularity.as_clustering()

# report how many vertices were clustered and how many clusters were found
print mod_clust.summary()

Clustering with 24724 elements and 126 clusters
Wall time: 1min 53s


In [16]:
# modularity cluster label of every vertex of g, indexed by the vertex 'name' attribute (the CL id)
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

### get a cluster of opinions (cluster 3 of modularity) to use for summarize cluster functions

In [17]:
# ids of the opinions labelled 3 by the modularity clustering; used as the example cluster for the summarize functions
cluster_3_mod = graph_clusters[graph_clusters == 3].index.tolist()
print "number of opinions in cluster 3 of modularity: ", len(cluster_3_mod)

number of opinions in cluster 3 of modularity:  1458


## walktrap on undirected scotus

In [80]:
%%time

# walktrap clustering
cd_walktrap = g.community_walktrap()

# turn the community-detection result into a clustering object (it provides .summary() and .membership)
wt_clust = cd_walktrap.as_clustering()

# report how many vertices were clustered and how many clusters were found
print wt_clust.summary()

Clustering with 24724 elements and 2264 clusters
Wall time: 2min 50s


In [81]:
# walktrap cluster label of every vertex of g, indexed by the vertex 'name' attribute (the CL id)
walktrap_clusters = pd.Series(wt_clust.membership, index=g.vs['name'])

# NLP Clustering

## K means clustering on tf-idf
**problem**: takes 1-3 hours for me...

In [None]:
%%time

# set number of clusters
num_clusters = 30

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

nlp_tfidf_clusters = km.labels_.tolist()

## Gaussian-Mixture-Model (GMM) Clustering on tf-idf
**problem**: can't do it on sparse matrices  
**treatments/solutions**:  
1. remove words with really high or low document frequency (tweak min_df and max_df when making bow or tfidf matrices)  
2. apply PCA to severely cut down dimension

In [None]:
%%time

# set number of clusters

# run GMM
gmm = GaussianMixture(n_components=1)
gmm.fit(tfidf_matrix)

gmm_clusters = gmm.labels_.tolist()

## Compare NLP clustering (tfidf) vs graph clustering

In [None]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
# one row per opinion that has a row in the tf-idf matrix
clusters = pd.DataFrame(index=list(op_id_to_bow_id.keys()), columns=['km', 'mod'])

# add in communities
# consider nodes not considered in CD to be their own cluster
# i.e. nodes outside the largest connected component
# (reindex/fillna/astype on the Series avoids the chained in-place fillna on a
# DataFrame column, and plain `int` replaces the removed `np.int` alias)
clusters['mod'] = (graph_clusters
                   .reindex(clusters.index)
                   .fillna(graph_clusters.max() + 1)
                   .astype(int))

# add in NLP clusters
# nlp_tfidf_clusters is ordered by tf-idf row, so look each opinion's label up
# through its row index instead of relying on the iteration order of the dict
clusters['km'] = [nlp_tfidf_clusters[op_id_to_bow_id[op_id]] for op_id in clusters.index]

# add in walktrap clusters (same treatment of nodes outside the largest component)
clusters['wt'] = (walktrap_clusters
                  .reindex(clusters.index)
                  .fillna(walktrap_clusters.max() + 1)
                  .astype(int))

clusters.to_csv(csv_dir + "clusters_full_tfidf.csv")

In [None]:
# display the combined cluster assignments (columns 'km', 'mod', 'wt')
clusters

### cluster pandas DataFrame saved as `clusters_full_tfidf.csv` in `csv_dir`

In [82]:
# reload the cluster assignments from the csv file in csv_dir
# NOTE(review): read without index_col, so the saved index comes back as
# 'Unnamed: ...' data columns (see the output below) -- confirm this is intended
clusters = pd.read_csv(csv_dir + 'clusters_full_tfidf.csv')
clusters

Unnamed: 0.1,Unnamed: 0,km,mod,wt
0,145658,12,1,5
1,89370,12,3,294
2,89371,9,0,35
3,89372,12,0,3
4,89373,20,0,3
5,89374,2,2,4
6,89375,16,2,5
7,89376,25,2,6
8,89377,22,2,7
9,89378,14,2,7


# Summarize Cluster Function 1
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

In [67]:
def sort_coo(m):
    '''
    Flatten a sparse matrix into its stored entries, largest value first.

    m is converted to COO format and every stored entry becomes a tuple
    (row_index, column_index, value).

    return a list of these tuples sorted by value in descending order
    (entries with equal values keep their COO order)
    '''
    coo = m.tocoo()
    triples = list(zip(coo.row, coo.col, coo.data))
    # index 2 of each triple is the stored value (the tf-idf score)
    triples.sort(key=lambda triple: triple[2], reverse=True)
    return triples

def top_k_words(opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    """
    This function summarizes a set of opinions by returning the distinct words
    that appear in these opinions with the highest tf-idf scores.

    A word is ranked by the highest tf-idf score it reaches in any of the given
    opinions, and is returned at most once even if it scores highly in several
    opinions.

    Parameters
    -----------
    opinions: list of opinion ids
    num_words: number of words to return as the summary
    tfidf_matrix: the tf-idf matrix of all SCOTUS opinions (sparse, one row per opinion)
    op_id_to_bow_id: dict that maps opinion ids to rows of the tfidf matrix
    vocab: sequence of words; position j is the word for column j of the matrix

    Output
    -------
    a list of (at most num_words) distinct words with the highest tf-idf scores
    among the given opinions, best first

    Raises
    -------
    KeyError if an opinion id is not a key of op_id_to_bow_id
    """
    vocab = np.array(vocab)

    # get row indices corresponding to the opinions
    row_indices = [op_id_to_bow_id[each_opinion] for each_opinion in opinions]

    # stored entries of the rows (opinions) from the cluster
    cluster_entries = tfidf_matrix[row_indices, :].tocoo()

    # stable descending sort of the stored entries by tf-idf value
    order = np.argsort(-cluster_entries.data, kind='mergesort')
    ranked_columns = cluster_entries.col[order]

    # keep only the first (highest scoring) occurrence of every column so
    # that no word is reported twice
    _, first_seen = np.unique(ranked_columns, return_index=True)
    distinct_columns = ranked_columns[np.sort(first_seen)]

    # get the words from column indices
    return vocab[distinct_columns[:num_words]].tolist()

### run function

In [20]:
%%time
#opinions = ['1722', '1723', '1724']
# summarize modularity cluster 3 with 20 words picked by top_k_words
top_words = top_k_words(cluster_3_mod, 20, tfidf_matrix, op_id_to_bow_id, vocab)
print top_words

[u'shaeffer', u'wool', u'carusi', u'toy', u'seed', u'jen', u'paper', u'renfrow', u'pearl', u'cork', u'tile', u'cadet', u'pardon', u'postmast', u'hilsman', u'ore', u'collector', u'nail', u'hoppl', u'frerich']
Wall time: 2.96 s


# Summarize Cluster Function 2
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

In [69]:
def top_k_words_from_mean_vector(opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    '''
    Summarize a cluster by the words with the largest average tf-idf score.

    The rows of tfidf_matrix belonging to `opinions` (looked up through
    op_id_to_bow_id) are averaged column by column, and the `num_words`
    vocab entries with the largest mean are returned, largest first.
    '''
    vocabulary = np.array(vocab)

    # matrix rows of the opinions in the cluster
    cluster_rows = [op_id_to_bow_id[op_id] for op_id in opinions]

    # column-wise mean over the cluster rows (a single row)
    centroid = tfidf_matrix[cluster_rows, :].mean(axis=0)

    # column indices ordered from largest to smallest mean
    ranked_columns = np.argsort(centroid, axis=1)[:, ::-1]

    # translate the column indices into words and keep the first num_words
    return vocabulary[ranked_columns].tolist()[0][:num_words]

### run function

In [21]:
%%time
#opinions = ['1722', '1723', '1724']
# summarize modularity cluster 3 with the 20 top words of its mean tf-idf vector
top_words_from_mean = top_k_words_from_mean_vector(cluster_3_mod, 20, tfidf_matrix, op_id_to_bow_id, vocab)
print top_words_from_mean

[u'court', u'state', u'act', u'unit', u'case', u'v', u'upon', u'said', u'contract', u'offic', u'law', u'made', u'shall', u'defend', u'plaintiff', u'error', u'duti', u'u', u'section', u'claim']
Wall time: 449 ms


# Summarize Cluster Function 3
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

In [71]:
def top_k_words_from_difference(opinions, all_opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    '''
    compute the mean tf-idf vector of the cluster and also of the complement of the cluster,
    take the difference mu_cluster - mu_complement, return the top K words in this difference

    Parameters
    -----------
    opinions: list of opinion ids forming the cluster
    all_opinions: list of all opinion ids; those not in `opinions` form the complement
    num_words: number of words to return
    tfidf_matrix: sparse tf-idf matrix, one row per opinion
    op_id_to_bow_id: dict that maps opinion ids to rows of the tfidf matrix
    vocab: sequence of words; position j is the word for column j of the matrix

    Output
    -------
    a list of the num_words words with the largest (mu_cluster - mu_complement),
    largest first. If the complement is empty its mean is taken to be zero, so
    the result is the top words of the cluster mean.

    Raises
    -------
    KeyError if an opinion id is not a key of op_id_to_bow_id
    '''
    vocab = np.array(vocab)

    # a set makes the complement test O(1) per opinion instead of a scan of the cluster list
    cluster_ids = set(opinions)

    # get row indices corresponding to the cluster and to its complement
    cluster_rows = [op_id_to_bow_id[op_id] for op_id in opinions]
    complement_rows = [op_id_to_bow_id[op_id]
                       for op_id in all_opinions if op_id not in cluster_ids]

    # column-wise mean of the cluster rows, as a flat vector
    cluster_mean = np.asarray(tfidf_matrix[cluster_rows, :].mean(axis=0)).ravel()

    # column-wise mean of the complement rows; the mean of zero rows is
    # undefined (division by zero), so an empty complement contributes nothing
    if complement_rows:
        complement_mean = np.asarray(tfidf_matrix[complement_rows, :].mean(axis=0)).ravel()
    else:
        complement_mean = np.zeros_like(cluster_mean)

    # mu_cluster - mu_complement
    difference = cluster_mean - complement_mean

    # column indices in descending order of the difference
    ranked_columns = np.argsort(difference)[::-1]

    # get the words from column indices
    return vocab[ranked_columns[:num_words]].tolist()

### Get list of all text files (all opinions)

In [None]:
def all_opinions(file_paths):
    '''
    Get list of all opinions/text files from the (.txt) file paths

    The opinion id is the first run of digits in the file NAME; digits in the
    directory part of the path (e.g. a network directory such as 'ca1') are
    ignored. Files whose name contains no digits are skipped.

    Parameters
    -----------
    file_paths: iterable of paths to the opinion text files

    Output
    -------
    list of opinion ids as strings, sorted by their integer value
    '''
    opinion_numbers = []
    for path in file_paths:
        # only look at the file name, never at the directories leading to it
        match = re.search(r'\d+', os.path.basename(path))
        if match is None:
            continue
        opinion_numbers.append(int(match.group()))

    # sort numerically, then convert back to strings
    opinion_numbers.sort()
    return [str(number) for number in opinion_numbers]

In [22]:
# ids of all opinions that have a text file, then a quick check of the first id, last id and element type
all_the_opinions = all_opinions(file_paths)
print all_the_opinions[0]
print all_the_opinions[-1]
print type(all_the_opinions[0])

1722
4023639
<type 'str'>


### run function

In [24]:
%%time
#opinions = ['1722', '1723', '1724']
# summarize modularity cluster 3 with the 20 top words of (cluster mean - complement mean)
top_words_from_diff = top_k_words_from_difference(cluster_3_mod, all_the_opinions, 20, tfidf_matrix, op_id_to_bow_id, vocab)
print top_words_from_diff

[u'indict', u'offic', u'collector', u'duti', u'unit', u'contract', u'navi', u'claimant', u'treasuri', u'act', u'servic', u'depart', u'articl', u'cent', u'shall', u'apprais', u'govern', u'section', u'charg', u'made']
Wall time: 3.89 s


# Summarize Cluster Function 4
compute the mean tf-idf vector of the cluster, return the document in the cluster closest to the mean

In [132]:
def document_closest_to_mean(opinions, tfidf_matrix, op_id_to_bow_id):
    '''
    compute the mean tf-idf vector, return the document in the cluster closest to the mean

    Parameters
    -----------
    opinions: list of opinion ids forming the cluster
    tfidf_matrix: sparse tf-idf matrix, one row per opinion
    op_id_to_bow_id: dict that maps opinion ids to rows of the tfidf matrix

    Output
    -------
    the id (taken from `opinions`) of the opinion whose tf-idf row has the
    smallest euclidean distance to the cluster mean; the first such opinion
    wins on ties. Returns None when `opinions` is empty.

    Raises
    -------
    KeyError if an opinion id is not a key of op_id_to_bow_id
    '''
    # get row indices corresponding to the opinions
    row_indices = [op_id_to_bow_id[each_opinion] for each_opinion in opinions]

    # an empty cluster has no mean and no closest document
    if len(row_indices) == 0:
        return None

    # to take the mean of each col over the rows (opinions) of the cluster
    mean_matrix = tfidf_matrix[row_indices, :].mean(axis=0)

    # convert to a flat vector (since row matrix)
    mean_vector = np.asarray(mean_matrix).ravel()

    # walk the cluster once, remembering the opinion id alongside its row so no
    # reverse lookup through op_id_to_bow_id is needed afterwards
    closest_opinion = None
    smallest_distance = None
    for each_opinion, row_index in zip(opinions, row_indices):
        # densify one row at a time; the full cluster would be too large to densify at once
        row_vector = tfidf_matrix[row_index].toarray().ravel()
        distance = np.linalg.norm(mean_vector - row_vector)
        if smallest_distance is None or distance < smallest_distance:
            smallest_distance = distance
            closest_opinion = each_opinion

    return closest_opinion

### run function

In [25]:
%%time
#opinions = ['1722', '1723', '1724']
# find the opinion of modularity cluster 3 whose tf-idf vector is closest to the cluster mean
most_relev_op = document_closest_to_mean(cluster_3_mod, tfidf_matrix, op_id_to_bow_id)
print "opinion", most_relev_op

opinion 86062
Wall time: 7.56 s
