In [2]:
import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

pyLDAvis.enable_notebook()

In [61]:
# raw_texts: the actual text of the files
# filenames: a list of filenames

from glob import glob
file_pattern = 'zero-carbon-bill/input/*.json'

filenames = glob(file_pattern)

raw_texts = []

for file in filenames:
    with open(file) as f:
        contents = f.read()
    raw_texts.append(contents)

In [102]:
# Converts the documents into a matrix of features
# features are interesting words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Zā]{3,}\b',
                                max_df = 0.5, 
                                min_df = 2,
                                input = 'filename')

# this is a DTM - document-term matrix
vectorized_data = vectorizer.fit_transform(filenames)

In [97]:
vectorizer.inverse_transform(filenames)

vectorizer.get_feature_names()

['ability',
 'able',
 'abroad',
 'absolutely',
 'absorb',
 'absorbed',
 'accelerated',
 'accelerating',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'access',
 'accessibility',
 'accessible',
 'accord',
 'according',
 'account',
 'accountable',
 'accounting',
 'accurate',
 'achievable',
 'achieve',
 'achieved',
 'achieving',
 'acidification',
 'acknowledge',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'actively',
 'activities',
 'activity',
 'actual',
 'actually',
 'adapt',
 'adaptation',
 'adapted',
 'adapting',
 'adaption',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'additionally',
 'address',
 'addressed',
 'addressing',
 'adds',
 'adjustment',
 'administrative',
 'adopt',
 'adopted',
 'advantage',
 'advantages',
 'adverse',
 'adversely',
 'advertised',
 'advice',
 'advisors',
 'advisory',
 'advocate',
 'affect',
 'affected',
 'affecting',
 'afford',
 'age',
 'agencies',
 'agenda',
 'ages',
 'aggressively',
 'ago',
 'agree',
 'agreement',
 'agreemen

In [98]:
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(vectorized_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [103]:
prepared_data = pyLDAvis.sklearn.prepare(lda, vectorized_data, vectorizer)

prepared_data

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [105]:
prepared_data.__str__()

"PreparedData(topic_coordinates=              x         y  topics  cluster       Freq\ntopic                                                \n0      0.138814  0.003372       1        1  24.009440\n8     -0.010006 -0.084529       2        1  12.038619\n6     -0.016653  0.041748       3        1  10.715769\n3     -0.010242  0.016721       4        1   9.433166\n5     -0.012373  0.008091       5        1   9.343609\n4     -0.016152  0.000820       6        1   7.486310\n7     -0.025872  0.012117       7        1   7.401430\n2     -0.016101 -0.013311       8        1   7.127960\n9     -0.016099  0.007079       9        1   6.266772\n1     -0.015316  0.007892      10        1   6.176925, topic_info=     Category      Freq         Term      Total  loglift  logprob\n1255  Default  4.000000          let   4.000000  30.0000  30.0000\n1200  Default  5.000000         just   5.000000  29.0000  29.0000\n722   Default  4.000000  environment   4.000000  28.0000  28.0000\n2460  Default  2.000000      

In [107]:
prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.138814,0.003372,1,1,24.00944
8,-0.010006,-0.084529,2,1,12.038619
6,-0.016653,0.041748,3,1,10.715769
3,-0.010242,0.016721,4,1,9.433166
5,-0.012373,0.008091,5,1,9.343609
4,-0.016152,0.00082,6,1,7.48631
7,-0.025872,0.012117,7,1,7.40143
2,-0.016101,-0.013311,8,1,7.12796
9,-0.016099,0.007079,9,1,6.266772
1,-0.015316,0.007892,10,1,6.176925


In [108]:
prepared_data.topic_order

[1, 9, 7, 4, 6, 5, 8, 3, 10, 2]

In [109]:
prepared_data.topic_info

Unnamed: 0,Category,Freq,Term,Total,loglift,logprob
1255,Default,4.000000,let,4.000000,30.0000,30.0000
1200,Default,5.000000,just,5.000000,29.0000,29.0000
722,Default,4.000000,environment,4.000000,28.0000,28.0000
2460,Default,2.000000,yes,2.000000,27.0000,27.0000
2393,Default,2.000000,waste,2.000000,26.0000,26.0000
...,...,...,...,...,...,...
1426,Topic10,0.338476,national,2.457653,0.8018,-6.1746
2467,Topic10,0.367136,zero,7.133848,-0.1825,-6.0934
680,Topic10,0.350186,emissions,10.165420,-0.5839,-6.1406
1447,Topic10,0.343485,new,9.051219,-0.4872,-6.1600


In [110]:
prepared_data.token_table

Unnamed: 0_level_0,Topic,Freq,Term
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0.585740,ability
27,1,0.623734,act
27,5,0.207911,act
29,1,0.581078,action
29,3,0.193693,action
...,...,...,...
2465,3,0.509967,zealanders
2467,1,0.420530,zero
2467,2,0.140177,zero
2467,5,0.140177,zero


In [111]:
prepared_data.lambda_step

0.01

In [112]:
prepared_data.topic_order

[1, 9, 7, 4, 6, 5, 8, 3, 10, 2]

In [115]:
prepared_data.doc_topic_dists

AttributeError: 'PreparedData' object has no attribute 'doc_topic_dists'