# Does Australian Legislation Fall into Well Defined Subgroups?
This notebook uses Tfidf, cosine_simmilarity and KMeans to plot Australian legislation into categories for further processing. The first half of this notebook was adapted from:
http://jonathansoma.com/lede/algorithms-2017/classes/clustering/k-means-clustering-with-scikit-learn/

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.manifold import MDS

import re
import string

import matplotlib.pyplot as plt
import matplotlib as mpl
import mpld3

from textblob import TextBlob
import os  # for os.path.basename
import pandas as pd

pd.set_option("display.max_columns", 50)
%matplotlib inline



All the acts have been scraped and saved. These are loaded again for processing

In [2]:
all_acts=pd.read_csv("acts/20200625-22-08-14_acts_unblank.csv")

In [3]:
len(all_acts)

1119

The code below is used during coding to run a fraction of the total acts downloaded to save time and resources.

In [256]:
rand_acts=all_acts.sample(frac=1, replace=False, random_state=1, axis=0)
rand_acts.columns=['title','text']
rand_acts.head(10)

Unnamed: 0,title,text
220,Commercial Broadcasting (Tax) Act 2017,Commercial Broadcasting (Tax) A...
678,National Health Act 1953,"National Health Act 1953 No. 95, 1953 Compi..."
785,Pay-roll Tax (State Taxation of Commonwealth A...,Pay‑roll Tax (State Taxation of C...
868,Qantas Sale Act 1992,"Qantas Sale Act 1992 No. 196, 1992 Compilat..."
934,Security of Critical Infrastructure Act 2018,Security of Critical Infrastruc...
474,Healthcare Identifiers Act 2010,"Healthcare Identifiers Act 2010 No. 72, 201..."
972,Stronger Futures in the Northern Territory Act...,Stronger Futures in the Northern Territory ...
101,Australian Bureau of Statistics Act 1975,Australian Bureau of Statistics Act 1975 No...
581,Life Insurance Supervisory Levy Imposition Act...,Life Insurance Supervisory Levy Imposition ...
90,Australia Act 1986,"Australia Act 1986 No. 142, 198..."


In [257]:
len(rand_acts)

1119

In [258]:
texts = rand_acts["text"].tolist()
len(texts)

1119

In [259]:
titles=rand_acts["title"].tolist()
len(titles)

1119

The text must be cleaned. This includes lemmatizing the words. 

In [260]:
def clean(text):
    for i in range(2):
        #Make text lowercase, remove text in square brackets,remove links,remove punctuation
        #and remove words containing numbers.
        text = str(text).lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        text = re.sub('\xa0\xa0','',text)
        text = re.sub('\xa0',' ',text)
        text = re.sub('  ',' ',text)
        text = " ". join([w.lemmatize() for w in TextBlob(text).words])
        return text

In [261]:
for i in range(len(texts)):
    texts[i]=clean(texts[i])

In [262]:
def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

## TFIDF
The TfidfVectorizer is intialised with the tokenizer above and the document frequencies to ignore in max and min df.

In [263]:
## Vectorize and save into a new dataframe
vec = TfidfVectorizer(tokenizer=textblob_tokenizer, 
                      lowercase=False, 
                      max_df=.7, 
                      min_df=0.01,  
                      use_idf=True)
#max_df and min_df apply to document frequency NOT tfidf
# stop_words='english',

%time tfidf_matrix = vec.fit_transform(texts)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vec.get_feature_names())
df_tfidf.head(10)

Wall time: 23min 51s


Unnamed: 0,aa,aaa,aat,ab,abandon,abbrevi,abet,abil,abl,abn,abod,abolish,abolit,aborigin,about,abov,abroad,abrog,absenc,absent,absolut,abstract,abstudi,abus,ac,...,—simplifi,—special,—staff,—standard,—state,—superannu,—suspens,—tax,—term,—that,—the,—to,—transfer,—transit,—treatment,—uncommenc,—use,—variat,—warrant,—when,—within,—work,‘,•,…
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086401
1,0.002888,0.0,0.0,0.001105,0.0,0.000998,0.000433,0.0,0.001302,0.0,0.0,0.0,0.000444,0.0,0.006978,0.0,0.0,0.0,0.002288,0.000644,0.0,0.001074,0.0,0.001921,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001373,0.002828,0.000553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001198,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.007089,0.003077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.008868,0.008382,0.0111,0.011938,0.0,0.0,0.0,0.0,0.006967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001929,0.0,0.0,0.0,0.009204,0.0,0.008019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.002247,0.002926,0.000922,0.000732,0.001313,0.0,0.0,0.0,0.0,0.01111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.008124,0.0,0.0,0.00412,0.001788,0.0,0.0,0.0,0.0,0.0,0.0,0.037886,0.034415,0.001325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005945,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.024676,0.0,0.0,0.0,0.0,0.0,0.0,0.010979,0.0,0.029449,0.0,0.0,0.0,0.021203,0.007962,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.037273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120611,0.0,0.0,0.011935,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04533,0.0,0.0


Checking if the stop words found by Tfidf seem reasonable. Endnot seems like it should have been incorporated in stop words. 

In [264]:
for word in ['the','or','is','they','was','endnot']: 
    print(word in vec.stop_words_)

True
True
True
False
False
False


Endnote seems to be in the featureset along with they. This seems strange and will need further analysis.

In [265]:
for word in ['the','or','is','they','was','endnot']: 
    print(word in vec.get_feature_names())

False
False
False
True
False
True


endnot has a relatively high tfidf on average. 

In [266]:
import numpy as np
df_nan = df_tfidf['endnot'].replace(0, np.NaN)
df_nan.mean()

0.10718569041184905

## Calculating the distance between vectors representing each Act
The distance of an act from every other act is calculated using cosine_similarity and will be used to cluster the data.
The code below was adapted from:
http://brandonrose.org/clustering

In [267]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [268]:
len(dist)

1119

In [284]:
num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 3min 7s


In [299]:
len(clusters)

1119

In [286]:
type(km.cluster_centers_)

numpy.ndarray

In [287]:
#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

The order_centroids code below seems to have a conceptual error. Finding the argmax (via argsort in reverse) of the coordinates of each cluster is not the most accurate way of finding the terms which define the cluster. The cluster_centers atribute is simply a vector with elements equal to all the features (words) in vec and thier corresponding value at which the centroid is. A larger coordinate doesn't neccessarily indicate a stronger correlation with the uniquness (tfidf) of the word in the cluster.

Let's try running tfidf again after breaking the corpus into cluster groups of text.

In [288]:
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# find the vocabulart and invert it so that the word numbers are keys and the words are values
vocab_inv = dict([(value, key) for key, value in vec.vocabulary_.items()]) 

##cluster center is the centroid of the cluster with the words in columns representing the location of the centroid
# just because certain words define the centroid, doesn't mean they have large tfidf values
# why would a large value in the cetroid location be important? Why use asgsort() in decending order?
# Almost need to rerun tfidf on clusters to find best words that distinguish clusters
# first tfidf finds words which distinguish acts from the corpus. Then categorise acts by clusters. Then tfidf the clusters


## Need a step here to find the highest tfidf values in each cluster

# Find the top 'n' words in each cluster (cluster_names)
# cluster_names={}
# for i in range(num_clusters): #for m clusters 0 through m
#     words=[]
#     for key in order_centroids[i,:6]:
#         words.append(vocab_inv[key])
        
#     cluster_names[i] = ', '.join(words)
    
# #set up colors per clusters using a dict
# cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

## Regroup text by clusters and run tfidf again.
THis might give a more meaningful list of terms to use to define each cluster.

In [289]:
clustered_text=['']*num_clusters
for i in range(num_clusters): #for m clusters 0 through m
    clustered_text[i]=''
    for j in range(len(texts)):
        if clusters[j]==i:
            clustered_text[i] += texts[j]+' '

In [291]:
%time tfidf_clustered_matrix = vec.transform(clustered_text)
datfram_tfidf_clustered_matrix = pd.DataFrame(tfidf_clustered_matrix.toarray(), columns=vec.get_feature_names())
datfram_tfidf_clustered_matrix.head(10)

Wall time: 22min 38s


Unnamed: 0,aa,aaa,aat,ab,abandon,abbrevi,abet,abil,abl,abn,abod,abolish,abolit,aborigin,about,abov,abroad,abrog,absenc,absent,absolut,abstract,abstudi,abus,ac,...,—simplifi,—special,—staff,—standard,—state,—superannu,—suspens,—tax,—term,—that,—the,—to,—transfer,—transit,—treatment,—uncommenc,—use,—variat,—warrant,—when,—within,—work,‘,•,…
0,0.003567,0.0,0.0,0.001517,0.0,0.098102,0.0,0.0,0.0,0.0,0.0,0.000717,0.003658,0.002191,0.130348,0.001322,0.0,0.0,0.005494,0.002211,0.0,0.0,0.0,0.0,0.0,...,0.001394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001589,0.0,0.001552,0.000759,0.002855,0.009648,0.0,0.040551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006574,0.0
1,0.002996,9.5e-05,0.001277,0.000836,0.003099,0.008075,0.000667,0.001626,0.003376,0.0,0.000109,0.00072,0.000576,0.037369,0.023617,0.011371,0.001163,0.000417,0.005493,0.003264,0.004139,0.000696,0.0,0.002725,0.000508,...,0.001646,0.000566,0.000236,0.000648,0.000258,0.0,0.000104,0.000105,0.0,0.000779,0.004606,0.000403,0.000717,0.001234,0.0,0.0007,9.5e-05,0.00128,0.000111,9.8e-05,0.0,0.000516,0.00691,0.008671,0.002277
2,0.002262,0.0,0.0,0.001443,0.0,0.041045,0.0,0.0,0.000637,0.0,0.0,0.0,0.0,0.0,0.054317,0.002514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003126,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001205,0.0,0.0,0.0,0.0,0.0,0.00688,0.00161,0.0,0.002884,0.0,0.0,0.0,0.0,0.003159,0.0,0.000707,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.002225,0.000289,0.002904,0.000544,0.000348,0.003404,0.00142,0.0,0.003147,0.009561,0.000221,0.0,0.002985,0.00072,0.024243,0.000684,0.0,0.000316,0.001078,0.000634,0.001107,0.0,0.001305,0.000394,0.000316,...,0.0,0.00044,0.0,0.0,0.0,0.001345,0.0,0.001918,0.0,7.5e-05,0.002873,0.0,0.000171,0.000512,0.000442,0.000894,0.000193,0.0,0.0,0.000197,0.000398,0.000417,0.0,0.006216,0.000171
5,0.003212,0.000229,0.001056,0.00115,0.004456,0.007675,0.001015,0.002238,0.004508,0.000683,0.000263,0.000306,0.000433,0.0,0.023382,0.005137,0.0,0.000125,0.002176,0.00132,0.000576,0.0,0.0,0.0,0.000188,...,0.0,0.001048,0.000285,0.00026,0.000621,0.000534,0.0,0.0,0.0,0.000536,0.004524,0.000971,0.002233,0.004496,0.0,0.001774,0.000115,0.0,0.0,0.000352,0.000118,0.0,0.000742,0.004829,0.001015
6,0.007203,0.000423,0.028622,0.002351,0.00148,0.007511,0.003018,0.002098,0.006405,0.001468,0.000273,0.000564,0.000798,0.002691,0.045635,0.003593,0.001654,0.000578,0.005256,0.003793,0.004628,0.000483,0.006054,0.007122,0.00104,...,0.006731,0.00082,0.000481,0.000359,0.000343,6.1e-05,0.00159,0.000234,0.00052,0.001337,0.007332,0.000571,0.000631,0.00172,0.000364,0.000715,0.001638,0.001123,0.000676,0.001217,0.000573,0.000486,0.000464,0.018334,0.001332
7,0.010933,0.000453,0.003106,0.002816,0.000856,0.011563,0.00079,0.002957,0.004335,0.002658,9.5e-05,0.001723,0.002211,0.016176,0.043818,0.002746,0.000306,0.000406,0.020488,0.014522,0.000917,0.000188,0.00073,0.00138,0.001285,...,0.003204,0.001619,0.003544,0.00028,0.000714,0.000959,0.00018,0.000456,0.001867,0.001284,0.007672,0.000426,0.003975,0.007776,0.000851,0.002008,0.000247,0.000553,0.0,0.001182,0.001192,0.001874,0.004378,0.020598,0.001203
8,0.005076,0.000188,0.000629,0.001354,8.5e-05,0.002138,0.0,0.001178,0.001976,0.0,0.0,0.000501,0.000355,0.001595,0.00835,0.001796,0.000199,0.0,0.008818,0.005096,0.000472,0.0,0.007828,0.00046,0.000539,...,0.000649,0.000343,0.0,0.0,0.0,0.000219,0.0,0.0,0.0,0.00139,0.00497,0.001943,0.000166,0.000125,0.0,0.000436,0.0,0.0,0.0,0.000192,0.0,0.0,0.014143,0.000319,0.000166
9,0.000675,0.0,0.0,0.0,0.0,0.003733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006763,0.00075,0.002916,0.0,0.001337,0.000753,0.000986,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001322,0.0,0.0,0.0,0.0,0.003187,0.0,0.003075,0.0,0.0,0.0,0.0,0.0,0.0,0.00851


In [292]:
cluster_words=np.array(datfram_tfidf_clustered_matrix).argsort()[:, ::-1]

In [293]:
cluster_words

array([[1138, 1566,  529, ..., 2254, 2251, 1937],
       [2945, 1913,  246, ..., 1902,  399, 2639],
       [ 534, 1257, 1138, ..., 2425, 2424, 1937],
       ...,
       [1126, 2034, 2932, ..., 3198, 1490, 2989],
       [2381, 2372, 2633, ..., 2544, 1075, 1937],
       [ 407, 3135, 1944, ..., 1915, 1914, 1937]], dtype=int64)

In [294]:
cluster_names={}
num_words=6
for i in range(num_clusters):
    cluster_names[i]=''
    for j in range(num_words):
        cluster_names[i]+=datfram_tfidf_clustered_matrix.iloc[i,:].sort_values(ascending=False).index[j]+' '

In [295]:
cluster_names

{0: 'endnot histori chang detail about sch ',
 1: 'shall licenc articl parti area petroleum ',
 2: 'charg export endnot impos histori industri ',
 3: '‑ figures—‑ portfolio ital appropri total ',
 4: '‑ incom tax you assess entiti ',
 5: 'insur apra levi corpor adi bodi ',
 6: 'court offenc penalti notic order good ',
 7: 'employe member servic minist employ care ',
 8: 'pension payment rate retir period benefit ',
 9: 'borrow subsidi loan partner treasur money '}

endnot is still showing up as an important term in the first cluster. The other terms in the cluster also sound like they are referencing other things (change detail histori...). Could these acts be amendments or additions to existing acts?

## Building the graph

In [296]:
MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

In [300]:
#set up colors per clusters using a dict
cluster_colors = {0:'#e6194b',1:'#3cb44b',2:'#ffe119',3:'#4363d8',4:'#f58231',5:'#911eb4',	6:'#46f0f0',7:
                  '#f032e6',8:'#bcf60c',9:'#fabebe'} #create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 

#group by cluster
groups = df.groupby('label')


# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=8, 
                     label=cluster_names[name], mec='none', 
                     color=cluster_colors[name], alpha=0.3)
#     ax.set_aspect('auto')
#     labels = [i for i in group.title]
    
    #set tooltip using points, labels and the already defined 'css'
#    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
#                                       voffset=10, hoffset=10, css=css)
#     #connect tooltip to fig
#     mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1,loc =0) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
html = mpld3.fig_to_html(fig)
print(html)



<style>

</style>

<div id="fig_el91967927266647012432832"></div>
<script>
function mpld3_load_lib(url, callback){
  var s = document.createElement('script');
  s.src = url;
  s.async = true;
  s.onreadystatechange = s.onload = callback;
  s.onerror = function(){console.warn("failed to load library " + url);};
  document.getElementsByTagName("head")[0].appendChild(s);
}

if(typeof(mpld3) !== "undefined" && mpld3._mpld3IsLoaded){
   // already loaded: just create the figure
   !function(mpld3){
       
       mpld3.draw_figure("fig_el91967927266647012432832", {"width": 1008.0, "height": 432.0, "axes": [{"bbox": [0.125, 0.125, 0.775, 0.755], "xlim": [-0.8851137864198206, 0.8677725917000311], "ylim": [-0.8541355034570013, 0.852014736347906], "xdomain": [-0.8851137864198206, 0.8677725917000311], "ydomain": [-0.8541355034570013, 0.852014736347906], "xscale": "linear", "yscale": "linear", "axes": [{"position": "bottom", "nticks": 0, "tickvalues": [], "tickformat": "", "scale": "linear", "f

## Conclusion
There is a distinct group of acts which may be refering to other acts. Possibly amending them such as cluster 0, 1, 2, 3
Income tax, ofences, insurance, employment, retirement and pensions, employment and banking may be other clusters in the data.

10 clusters may be too many. 9, 5, both discuss banking. and 0,1,2,3 and 6 all refer to court proceding and other legislation.

The upper right of the graph is dominated by court and legal matters and the lower left is dominated by economic concerns. 

Increasing the depth of terms while reducing the number of clusters may provide more insight by combining clusters while still keeping their terms within larger groups. This might highlight more distinct groupings. 