In [2]:
import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Switch pyLDAvis into notebook mode; per the pyLDAvis docs this makes prepared
# visualisations render inline when they are the last expression of a cell.
pyLDAvis.enable_notebook()

In [61]:
# filenames: sorted list of paths to the input files
# raw_texts: the raw text of each file, in the same order as `filenames`

from glob import glob

file_pattern = 'zero-carbon-bill/input/*.json'

# glob() returns matches in arbitrary (filesystem-dependent) order. Sorting
# keeps the document order -- and therefore the row order of any matrix built
# from `filenames` -- identical between runs and machines.
filenames = sorted(glob(file_pattern))

raw_texts = []

for file in filenames:
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # default encoding, while TfidfVectorizer(input='filename') decodes the
    # same files as UTF-8 by default.
    with open(file, encoding='utf-8') as f:
        raw_texts.append(f.read())

In [102]:
# Turn the documents into a TF-IDF weighted document-term matrix (DTM):
# one row per file in `filenames`, one column per retained word.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#
# NOTE(review): accents are stripped during preprocessing, before the token
# pattern is applied, so the 'ā' in the pattern probably never matches --
# confirm before relying on it.
vectorizer = TfidfVectorizer(
    input = 'filename',                      # fit_transform() receives paths and reads the files itself
    lowercase = True,
    strip_accents = 'unicode',
    stop_words = 'english',
    token_pattern = r'\b[a-zA-Zā]{3,}\b',    # whole words of at least three letters
    min_df = 2,                              # skip words found in fewer than 2 documents
    max_df = 0.5,                            # skip words found in more than half of the documents
)

# the DTM itself
vectorized_data = vectorizer.fit_transform(filenames)

In [97]:
# Map the document-term matrix back to the terms retained for each document.
# inverse_transform() expects the matrix returned by fit_transform(), not the
# list of input file paths. The result is bound to a name so it can be
# inspected, since only the last expression of a cell is displayed.
terms_per_document = vectorizer.inverse_transform(vectorized_data)

# Full vocabulary learned by the vectorizer, in column order of the DTM.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; on those versions call get_feature_names_out() instead.
vectorizer.get_feature_names()

['ability',
 'able',
 'abroad',
 'absolutely',
 'absorb',
 'absorbed',
 'accelerated',
 'accelerating',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'access',
 'accessibility',
 'accessible',
 'accord',
 'according',
 'account',
 'accountable',
 'accounting',
 'accurate',
 'achievable',
 'achieve',
 'achieved',
 'achieving',
 'acidification',
 'acknowledge',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'actively',
 'activities',
 'activity',
 'actual',
 'actually',
 'adapt',
 'adaptation',
 'adapted',
 'adapting',
 'adaption',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'additionally',
 'address',
 'addressed',
 'addressing',
 'adds',
 'adjustment',
 'administrative',
 'adopt',
 'adopted',
 'advantage',
 'advantages',
 'adverse',
 'adversely',
 'advertised',
 'advice',
 'advisors',
 'advisory',
 'advocate',
 'affect',
 'affected',
 'affecting',
 'afford',
 'age',
 'agencies',
 'agenda',
 'ages',
 'aggressively',
 'ago',
 'agree',
 'agreement',
 'agreemen

In [98]:
# Fit a 10-topic LDA model on the document-term matrix. The fixed random_state
# seeds the estimator so repeated runs on the same matrix give the same topics.
lda = LatentDirichletAllocation(
    random_state=0,
    n_components=10,
)
lda.fit(vectorized_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [None]:
# Combine the fitted LDA model, the document-term matrix and the vectorizer
# into the data structure pyLDAvis uses for its topic visualisation.
prepared_data = pyLDAvis.sklearn.prepare(
    lda_model=lda,
    dtm=vectorized_data,
    vectorizer=vectorizer,
)

# Last expression of the cell, so the notebook displays it.
prepared_data