In [None]:
# -*- coding: utf-8 -*-
"""
Topic modelling







"""
import pandas as pd
import matplotlib.pyplot as plt
import spacy, nltk, random, os, json
import numpy as np

# directory where file is stored
# NOTE(review): absolute, machine-specific working directory; every relative
# path used later (inputs and saved artefacts) resolves against it.
os.chdir(r'C:\research\NY')

# Read the raw documents. Later code iterates `data.values()` and measures each
# value with len(), so the values are presumably text strings - TODO confirm.
with open('NY_text.json') as file:
    data = json.load(file)

# NOTE(review): the loaded pipeline is not bound to a name, so this call only
# checks that the large English model can be loaded; tokenisation below uses
# the blank `English()` pipeline instead.
spacy.load('en_core_web_lg')

from spacy.lang.en import English

# Blank English pipeline, used as the tokenizer by `tokenize`.
parser = English()

# NLTK resources needed for lemmatisation and stop-word filtering.
nltk.download('wordnet')
nltk.download('stopwords')

# Standard English stop words plus corpus-specific additions.
# NOTE(review): 'illinois' looks out of place for an NY corpus - confirm.
en_stop = nltk.corpus.stopwords.words('english')
new_stop_words = ['come','illinois','state','shall','would','energy','hearing','million']
en_stop.extend(new_stop_words)
    

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora


"""
Function written to tokenize text objects
"""
def tokenize(text):
    """Split ``text`` into normalised token strings.

    The module-level ``parser`` does the tokenisation. Whitespace-only tokens
    are dropped, URL-like tokens become the placeholder ``'URL'``, tokens that
    start with ``'@'`` become ``'SCREEN_NAME'`` and everything else is
    lower-cased.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Normalised tokens in document order.
    """
    def _normalise(token):
        # Placeholders keep links and handles from fragmenting the vocabulary.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [_normalise(token) for token in parser(text)
            if not token.orth_.isspace()]

"""
Function to get word lemma
"""
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

"""
Function to lemmatize words missed by wordnet
"""
def get_lemma2(word):
    """Lemmatize ``word`` with NLTK's ``WordNetLemmatizer`` (default arguments)."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

"""
Function to prepare text for modelling
"""
def prepare_text_for_lda(text):
    """Turn one raw document into the token list fed to the LDA model.

    Steps, per token produced by ``tokenize``:

    1. discard tokens of four characters or fewer and tokens in ``en_stop``;
    2. lemmatise with ``get_lemma`` and then ``get_lemma2``;
    3. discard the result if lemmatisation turned it into a stop word.

    The length filter also removes the ``'URL'`` placeholder emitted by
    ``tokenize``, since it is only three characters long.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Filtered, lemmatised tokens in document order.
    """
    prepared = []
    for raw_token in tokenize(text):
        if len(raw_token) <= 4 or raw_token in en_stop:
            continue
        lemma = get_lemma2(get_lemma(raw_token))
        # Second stop-word pass: an inflected form may reduce to a stop word.
        if lemma not in en_stop:
            prepared.append(lemma)
    return prepared


# set up list for text
# Documents at or above this many characters are left out of the corpus.
# NOTE(review): presumably chosen to stay under spaCy's default `max_length`
# of 1,000,000 characters for a pipeline - confirm.
MAX_DOC_CHARS = 1000000

text_data = []
skipped_docs = 0

# iterate over dict to prepare corpus
for text_values in data.values():
    if len(text_values) >= MAX_DOC_CHARS:
        # Count rather than silently drop, so the corpus size can be audited.
        skipped_docs += 1
        continue
    tokens = prepare_text_for_lda(text_values)
    text_data.append(tokens)
    # Random spot check: print the tokens of roughly 1% of the documents.
    if random.random() > .99:
        print(tokens)

if skipped_docs:
    print('Skipped {} document(s) with {} or more characters'.format(skipped_docs, MAX_DOC_CHARS))

################################################################        
# Part 2: lda
# construct corpora dictionary for gensim model from text_data, which are the tokens
# Build the gensim vocabulary from the token lists, then convert every
# document into its bag-of-words representation.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Persist the corpus and vocabulary. The context manager guarantees the pickle
# file handle is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

import gensim
# Candidate topic counts to fit and inspect.
topic_amnts = [4,5,6,7,8,9]

# NOTE: `lda_model` and `NUM_TOPICS` keep the values from the LAST iteration
# (the largest topic count) once the loop finishes; the plotting and export
# cells further down use that `lda_model`.
for NUM_TOPICS in topic_amnts:
    lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                                num_topics=NUM_TOPICS,
                                                id2word=dictionary,
                                                passes=15,
                                                # Fresh, identically seeded generator per fit.
                                                random_state=np.random.RandomState(seed=42),
                                                minimum_probability=0.05)
    #lda_model.save('model5.gensim')

    # Show the 20 highest-weighted words of every topic for this topic count.
    topics = lda_model.print_topics(num_words=20)
    print('num_topics = '+ str(NUM_TOPICS))
    for topic in topics:
        print(topic)
        
        


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pjpriole\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pjpriole\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['american', 'recovery', 'reinvestment', 'utility', 'project', 'brief', 'description', 'priority', 'scale', 'purpose', 'rationale', 'justification', 'expect', 'benefit', 'quantify', 'possible', 'create', 'retain', 'total', 'project', 'cost', 'amount', 'request', 'project', 'smart', 'disruption', 'recovery', 'shovel', 'ready', 'utility', 'submit', 'project', 'utility', 'contact', 'project', 'contact', 'email', 'smart', 'phase', 'installation', 'endpoint', 'inclusive', 'smart', 'device', 'electric', 'meter', 'distribution', 'sensor', 'smart', 'phase', 'installation', 'endpoint', 'internal', 'inclusive', 'smart', 'device', 'electric', 'meter', 'distribution', 'sensor', 'explore', 'accuracy', 'integrity', 'technology', 'available', 'relate', 'smart', 'meter', 'meter', 'platform', 'evaluate', 'benefit', 'relate', 'automation', 'manual', 'task', 'determine', 'customer', 'response', 'dynamic', 'structure', 'network', 'equipment', 'installation', 'usage', 'display', 'smart', 'appliance', 'expl




['demonstration', 'project', 'implementation', 'commercial', 'battery', 'storage', 'date', 'demonstration', 'project', 'implementation', 'commercial', 'battery', 'storage', 'table', 'content', 'section', 'demonstration', 'design', '........................................................................', 'statement', '........................................................................................................', 'population', '.........................................................................................................', 'scenario', '...........................................................................................................', 'checkpoint', '............................................................................................................', 'section', 'project', 'structure', 'governance', '....................................................', 'project', '...................................................................................

['karla', 'corpus', 'senior', 'counsel', 'regulatory', 'january', 'electronic', 'delivery', 'honorable', 'kathleen', 'burgess', 'secretary', 'public', 'service', 'commission', 'three', 'empire', 'plaza', 'floor', 'albany', '12223', '14-m-0101', 'proceeding', 'motion', 'commission', 'regard', 'reform', 'vision', 'national', 'clifton', 'demand', 'reduction', 'demonstration', 'project', 'implementation', 'please', 'direct', 'question', 'regard', 'filing', 'niagara', 'mohawk', 'power', 'corporation', 'national', 'national', 'hereby', 'secretary', 'burgess', 'submit', 'filing', 'clifton', 'demand', 'reduction', 'demonstration', 'project', 'implementation', 'require', 'demonstration', 'project', 'assessment', 'report', 'assessment', 'report', 'file', 'department', 'public', 'service', 'staff', 'staff', 'commission', 'december', '14-m-0101.1', 'philip', 'austen', 'director', 'solution', 'delivery', 'national', 'country', 'hicksville', '11801', 'mobile', 'email', 'pausten@nationalgrid.com', 'a




['public', 'service', 'commission', 'session', 'public', 'service', 'commission', 'albany', 'september', 'commissioner', 'present', 'rhodes', 'chair', 'gregg', 'sayre', 'diane', 'burman', 'james', 'alesi', '15-e-0751', 'matter', 'value', 'distribute', 'resource', 'order', 'approving', 'minor', 'tranche', 'adjustment', 'issue', 'effective', 'september', 'commission', 'introduction', 'march', 'commission', 'issue', 'phase', 'order', 'include', 'specific', 'megawatt', 'capacity', 'allocation', 'three', 'tranche', 'utility', 'territory.1', 'community', 'distribute', 'generation', 'project', 'interconnect', 'issuance', 'phase', 'order', 'assign', 'first', 'tranche', 'territory', 'interconnect', 'assignment', 'determine', 'eligibility', 'phase', 'meter', 'particular', 'market', 'transition', 'credit', 'electric', 'corporation', 'nyseg', 'company', 'file', 'petition', 'approval', '15-e-0751', 'value', 'distribute', 'resource', 'order', 'meter', 'transition', 'phase', 'value', 'distribute', 'r

In [None]:
%matplotlib
# Number of prepared tokens in each document.
doc_lens = [len(doc) for doc in text_data]

# Plot
plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, bins = 1000, color='navy')

# Summary statistics written onto the plot: (y position, label, value).
length_stats = [
    (100, "Mean   : ", np.mean(doc_lens)),
    (90, "Median : ", np.median(doc_lens)),
    (80, "Stdev   : ", np.std(doc_lens)),
    (70, "1%ile    : ", np.quantile(doc_lens, q=0.01)),
    (60, "99%ile  : ", np.quantile(doc_lens, q=0.99)),
]
for stat_y, stat_label, stat_value in length_stats:
    plt.text(750, stat_y, stat_label + str(round(stat_value)))

hist_ax = plt.gca()
hist_ax.set(xlim=(0, 50000), ylabel='Number of Documents', xlabel='Document Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0,50000,9))
plt.title('Distribution of NY Document Word Counts', fontdict=dict(size=22))
plt.show()

In [None]:
%matplotlib
import matplotlib.colors as mcolors
from collections import Counter
# (topic_id, [(word, weight), ...]) pairs for the current model.
topics = lda_model.show_topics(formatted=False)

# Corpus-wide frequency of every prepared token.
data_flat = []
for w_list in text_data:
    data_flat.extend(w_list)
counter = Counter(data_flat)

# One row per (topic, keyword): the keyword's topic weight and corpus count.
out = [[word, topic_id, weight, counter[word]]
       for topic_id, topic_terms in topics
       for word, weight in topic_terms]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(3, 3, figsize=(16,10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_df = df.loc[df.topic_id==i, :]
    topic_color = cols[i]

    # Wide translucent bars: corpus count (left axis).
    ax.bar(x='word', height="word_count", data=topic_df, color=topic_color, width=0.5, alpha=0.3, label='Word Count')
    # Narrow solid bars: topic weight (right axis).
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_df, color=topic_color, width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=topic_color)
    ax_twin.set_ylim(0, 0.030)
    ax.set_ylim(0, 100000)
    ax.set_title('Topic: ' + str(i+1), color=topic_color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')

fig.tight_layout()
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()

In [None]:
from gensim.models.coherencemodel import CoherenceModel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Fit one LDA model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit`` is
    exclusive. Each model is trained with 15 passes and its own
    ``RandomState(seed=42)``.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary
        Vocabulary passed as ``id2word`` and to the coherence model.
    corpus : list
        Bag-of-words corpus the models are trained on.
    texts : list of list of str
        Tokenised documents used by the ``'c_v'`` coherence measure.
    limit : int
        Exclusive upper bound on the number of topics.
    start : int, optional
        First number of topics to try.
    step : int, optional
        Increment between successive topic counts.

    Returns
    -------
    tuple of (list, list)
        The fitted models and their coherence scores, in matching order.
    """
    fitted_models = []
    scores = []
    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            passes=15,
            random_state=np.random.RandomState(seed=42),
            minimum_probability=0.05,
        )
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        fitted_models.append(lda)
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [2]:
%matplotlib
# Sweep settings, defined once so the models and the x-axis cannot drift apart.
# This fits one model per value in range(start, limit, step) - 48 models here -
# so the cell is expensive to run.
# Requires `dictionary`, `corpus` and `text_data` from the earlier cells; the
# NameError recorded below this cell comes from running it before they existed.
start = 2
limit = 50
step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=text_data, start=start, limit=limit, step=step)
# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a one-element tuple; a bare parenthesised
# string is iterated character by character and the label would become "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

Using matplotlib backend: Qt5Agg


NameError: name 'dictionary' is not defined

In [None]:
import pandas as pd

# One list of "weight*word" fragments per topic of the current `lda_model`.
# NOTE(review): the whole item returned by print_topics is stringified before
# splitting on '+', so the first and last fragments presumably keep the
# surrounding tuple punctuation - confirm that is acceptable in the CSV.
topics = [str(topic).split('+') for topic in lda_model.print_topics(num_words=20)]

# Column names are generated from however many topics the model has, so this
# cell no longer raises IndexError (or drops topics) when the model does not
# have exactly nine.
topics_dict = {'topic_{}'.format(topic_number): fragments
               for topic_number, fragments in enumerate(topics, start=1)}

results_df = pd.DataFrame(topics_dict)
results_df.to_csv('ny_results.csv')
results_df



    
    


In [None]:
# lda word cloud visualization
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors


# colors for wordcloud
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

# `color_func` reads the loop index `i` when it is called, i.e. while the
# drawing loop below is running, so every cloud is drawn in its topic's colour.
# NOTE(review): per the wordcloud docs `stopwords` is not applied by
# generate_from_frequencies and `colormap` is overridden by `color_func` -
# confirm against the installed version.
cloud = WordCloud(stopwords=en_stop,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=15,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1)

# (topic_id, [(word, weight), ...]) pairs for the current model.
topics = lda_model.show_topics(formatted=False)

# The grid needs at least one panel per topic: adjust rows/columns here when
# the number of topics changes.
fig, axes = plt.subplots(3, 3, figsize=(10,10), sharex=True, sharey=True)
panels = axes.flatten()

# Hide the frame of every panel up front so unused panels stay blank.
for ax in panels:
    ax.axis('off')

# zip() stops at the shorter of panels/topics, so a model with fewer topics
# than panels no longer raises IndexError.
for i, (ax, (topic_id, word_weights)) in enumerate(zip(panels, topics)):
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('NY Topic ' + str(i+1), fontdict=dict(size=16))
    ax.axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0,y=0)
plt.tight_layout()
# Save through the figure object BEFORE showing it: depending on the backend,
# the figure may be closed by show(), and a later savefig writes a blank image.
fig.savefig('NY4_topic_wc_final.png')
plt.show()

In [None]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show, save, output_file
from bokeh.models import Label
from bokeh.io import output_notebook
import pandas as pd
import numpy as np

# Get topic weights
# One row per document; each row is a vector with one position per topic,
# where position i holds the probability the model assigned to topic i.
# Topics the model left out for a document (it was built with
# minimum_probability=0.05) stay at 0.
# The topic count is read from the model itself, so the vector length and the
# plot title always agree with `lda_model`.
n_topics = lda_model.num_topics
topic_weights = []
for row_list in lda_model[corpus]:
    tmp = np.zeros(n_topics)
    for i, w in row_list:
        tmp[i] = w
    topic_weights.append(tmp)
# Array of topic weights
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
#arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
# `mcolors` comes from the matplotlib.colors import in an earlier cell.
output_notebook()
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
# NOTE(review): newer Bokeh releases use `width`/`height` instead of
# `plot_width`/`plot_height` - confirm against the installed version.
plot = figure(title="t-SNE Clustering of {} LDA Topics in NY".format(n_topics), 
              plot_width=900, plot_height=700)
# Each document is one point, coloured by its dominant topic.
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
output_file('NY_4_topic_tsne.html')
show(plot)
save(plot)

In [None]:
import pyLDAvis.gensim
# Build the pyLDAvis visualisation data from the current `lda_model`, the
# bag-of-words `corpus` and the gensim `dictionary` created in earlier cells.
# sort_topics=False is passed so topics are not re-ordered by pyLDAvis
# (presumably keeping the model's own topic numbering - TODO confirm).
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
# Render the prepared visualisation as this cell's output.
pyLDAvis.display(lda_display)

In [None]:
# Write the visualisation prepared in the previous cell (`lda_display`) to an
# HTML file; the relative path resolves against the current working directory.
pyLDAvis.save_html(lda_display,fileobj='NY_4_topic_lda_display.html')