In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load every input file from disk.
# filenames: paths of the input files, in sorted order
# raw_texts: the text of each file, index-aligned with `filenames`

from glob import glob
file_pattern = 'zero-carbon-bill/input/*.json'

# glob() returns matches in arbitrary, filesystem-dependent order; sorting
# keeps document indices (and therefore the seeded clustering further down)
# stable between runs and machines.
filenames = sorted(glob(file_pattern))

raw_texts = []

for path in filenames:
    # Decode explicitly as UTF-8 instead of the platform default, so the text
    # kept here matches what TfidfVectorizer(input='filename') reads from the
    # same files (its default encoding is 'utf-8').
    with open(path, encoding='utf-8') as f:
        raw_texts.append(f.read())

In [3]:
# Turn the documents into a TF-IDF weighted feature matrix whose columns are
# the "interesting" words that survive the filters below.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

vectorizer = TfidfVectorizer(
    input='filename',         # fit_transform() is given paths and reads the files itself
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    # Tokens are whole words made of at least three letters.
    # NOTE(review): strip_accents is applied during preprocessing, so the
    # macron 'ā' in this pattern probably never reaches the tokenizer - confirm
    # whether Māori spellings were meant to be preserved.
    token_pattern=r'\b[a-zA-Zā]{3,}\b',
    min_df=2,                 # ignore words that occur in fewer than 2 documents
    max_df=0.5,               # ignore words that occur in more than half of the documents
)

# DTM (document-term matrix): one row per entry of `filenames`
vectorized_data = vectorizer.fit_transform(filenames)

In [4]:
# Group the documents into 5 clusters; random_state is pinned so repeated runs
# start from the same initialisation.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans = kmeans.fit(vectorized_data)  # fit() hands back the fitted estimator

In [7]:
# Exploratory scratch cell: list every attribute and method name exposed by the
# fitted KMeans object (labels_, cluster_centers_, transform, ...).
dir(kmeans)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_test_data',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_transform',
 'algorithm',
 'cluster_centers_',
 'copy_x',
 'fit',
 'fit_predict',
 'fit_transform',
 'get_params',
 'inertia_',
 'init',
 'labels_',
 'max_iter',
 'n_clusters',
 'n_init',
 'n_iter_',
 'n_jobs',
 'precompute_distances',
 'predict',
 'random_state',
 'score',
 'set_params',
 'tol',
 'transform',
 'verbose']

In [11]:
# Re-express every document relative to the fitted clusters: the result has one
# row per document and one column per cluster (shape (599, 5) in the recorded run).
# Per the scikit-learn docs, KMeans.transform maps samples into
# cluster-distance space, i.e. each value is a distance to a cluster centre.
kmeans_transformed_data = kmeans.transform(vectorized_data)

In [12]:
# Exploratory scratch cell: list the attribute and method names of the
# transformed result (the recorded output shows NumPy array members such as T).
dir(kmeans_transformed_data)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__

In [14]:
# Display the transformed matrix. Evaluating the array itself renders the same
# (automatically truncated) repr; a no-argument .view() call would only create
# a redundant second array object over the same data.
kmeans_transformed_data

array([[0.99431392, 1.02287337, 1.01600109, 1.00542015, 1.03882433],
       [1.00069254, 0.96673902, 0.98939516, 0.99015869, 1.02943148],
       [0.99573483, 0.99218884, 0.99627333, 0.95705974, 1.02505609],
       ...,
       [0.99091788, 0.9711786 , 0.99266263, 0.98508134, 1.02154671],
       [0.99704844, 1.00402252, 0.99241793, 0.99300352, 0.96462043],
       [0.9952054 , 1.01301809, 0.99896171, 0.98543348, 1.02798544]])

In [15]:
# Dimensions of the transformed matrix as (rows, columns).
kmeans_transformed_data.shape

(599, 5)

In [20]:
# Inspect the row for the document at index 1: one value per cluster.
kmeans_transformed_data[1]

array([1.00069254, 0.96673902, 0.98939516, 0.99015869, 1.02943148])

In [19]:
# Text of the document at index 1. raw_texts was filled by iterating
# `filenames`, the same list handed to the vectorizer, so this index lines up
# with row 1 of the vectorized / transformed matrices.
raw_texts[1]

'No zero emissions, no climate change legislation without a public referendum to give a clear mandate. The current government does\n\nnot have such a mandate.\n'