<a href="https://colab.research.google.com/github/Rae-Jiang/Capstone-Topic-Modeling/blob/master/VAE/VAE_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Variational Inference for Topic Models 
- TensorFlow version

In [7]:
# Fetch every NLTK resource this notebook relies on, once, before anything uses it.
import nltk
nltk.download('words')      # English word list behind the module-level `words` set
nltk.download('stopwords')  # English stopword list behind the module-level `stops` set
nltk.download('wordnet')    # corpus that WordNetLemmatizer.lemmatize looks words up in

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools,time
import sys, os
from collections import OrderedDict,Counter
from copy import deepcopy

import matplotlib.pyplot as plt
import pickle as pkl
import sys, getopt
import re
import gensim

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
from nltk.stem import PorterStemmer
porter = PorterStemmer()

import nltk
from nltk.corpus import stopwords
words = set(nltk.corpus.words.words())
stops = set(stopwords.words('english'))

In [0]:
## data loading and preparation

In [103]:
#load data
# NOTE: unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
# The `with` block closes the file handle as soon as the corpus has been read.
with open("data/sent_tokens.p", "rb") as token_file:
    data = pkl.load(token_file) #list of tokenized words in each document
# Phrase model fitted on the raw token lists; process_text applies it as bigram[line].
bigram = gensim.models.Phrases(data)




In [127]:
def process_text(texts, with_bigram=True,filter_stopword_spacy = False, remove_non_eng = False,with_lemmatize = False, with_stem = False):
    """
    Clean a corpus of tokenized documents through a fixed sequence of stages.

    Stages, in the order they run (each one sees the output of the previous):

    1. Always: drop tokens that are in the module-level NLTK stopword set
       `stops` and tokens that are not purely alphabetic.
    2. remove_non_eng: keep only tokens whose lowercase form is in the
       module-level English word set `words`.
    3. with_bigram: pass every document through the module-level `bigram`
       phrase model.
    4. with_lemmatize: lemmatize every token with the module-level `lemmatizer`.
    5. with_stem: stem every token with the module-level `porter` stemmer.
    6. filter_stopword_spacy: drop tokens whose `nlp.vocab` entry has
       `is_stop` set.
       NOTE(review): `nlp` is not defined in the visible cells; presumably a
       loaded spaCy pipeline -- confirm it exists before enabling this flag.

    Parameters:
    ----------
    texts: Tokenized texts (iterable of documents, each an iterable of str).
    with_bigram, filter_stopword_spacy, remove_non_eng, with_lemmatize,
    with_stem: switches for the optional stages listed above.

    Returns:
    -------
    list of documents, each the list of tokens that survived processing.
    """
    def passes_basic_filter(token):
        # NLTK stopword check first, then the alphabetic check.
        return token not in stops and token.isalpha()

    docs = [list(filter(passes_basic_filter, doc)) for doc in texts]

    if remove_non_eng:
        # Membership is tested on the lowercase form; the token itself keeps its case.
        docs = [[token for token in doc if token.lower() in words] for doc in docs]

    if with_bigram:
        # Indexing the phrase model with a token list returns the document after phrase detection.
        docs = [bigram[doc] for doc in docs]

    if with_lemmatize:
        docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

    if with_stem:
        docs = [list(map(porter.stem, doc)) for doc in docs]

    if filter_stopword_spacy:
        docs = [[token for token in doc if not nlp.vocab[token].is_stop] for doc in docs]

    return docs

# Build six variants of the corpus; every call also runs process_text's
# unconditional NLTK-stopword / alphabetic filter, and bigram detection is on throughout.
# data0: bigrams only
data0 = process_text(data, with_bigram=True)
# data1: + spaCy stopword filter
data1 = process_text(data, with_bigram = True, filter_stopword_spacy = True)
# data2: + spaCy stopword filter + English-dictionary filter
data2 = process_text(data, with_bigram = True, filter_stopword_spacy = True, remove_non_eng = True)
# data3: + spaCy stopword filter + lemmatization
data3 = process_text(data, with_bigram = True, filter_stopword_spacy = True, remove_non_eng = False, with_lemmatize = True)
# data4: + spaCy stopword filter + English-dictionary filter + lemmatization
data4 = process_text(data, with_bigram = True, filter_stopword_spacy = True, remove_non_eng = True, with_lemmatize = True)
# data5: + spaCy stopword filter + stemming
data5 = process_text(data, with_bigram = True, filter_stopword_spacy = True, remove_non_eng = False,with_lemmatize = False, with_stem = True)



KeyboardInterrupt: ignored

In [0]:
#compare
pd.DataFrame(list(zip(data[0][:20],data0[0][:20],data1[0][:20],data2[0][:20],data3[0][:20],data4[0][:20],data5[0][:20])), 
               columns =['original', 'with_bigram','filter_stopword_spacy','remove_non_eng','with_lemmatize','remove_non_eng&with_lemmatize','with_stem'])


Unnamed: 0,original,with_bigram,filter_stopword_spacy,remove_non_eng,with_lemmatize,remove_non_eng&with_lemmatize,with_stem
0,charlotte,charlotte,charlotte,reaching,charlotte,reaching,charlott
1,hilton,hilton,hilton,goal,hilton,goal,hilton
2,andersen,andersen,andersen,actually,andersen,actually,andersen
3,dec,dec,dec,switch,dec,switch,dec
4,19,reaching,reaching,default,reaching,default,reach
5,you,goal,goal,setting,goal,setting,goal
6,’re,actually,actually,critical,actually,critical,actual
7,not,make,switch,curious,switch,curious,switch
8,reaching,one,default,stop,default,stop,default
9,your,switch,setting,trying,setting,trying,set


In [273]:
# Colab only: mount Google Drive at /content/drive (asks for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
# Project folder on the mounted Drive; data_folder_path is where the pickled
# corpora are written and res_folder_path is where tuning results are written.
basedir = '/content/drive/My Drive/1006 Capstone/Rui'
data_folder_path = os.path.join(basedir, 'data')
res_folder_path = os.path.join(basedir, 'result')

In [0]:
# Persist the preprocessed corpora to the Drive data folder.
# Each file is opened in a `with` block so it is flushed and closed even if
# pickling fails. data4 and data5 are not saved here.
for corpus, file_name in [
    (data0, 'sent_tokens_bi.p'),
    (data1, 'sent_tokens_bi_spacy.p'),
    (data2, 'sent_tokens_bi_spacy_eng.p'),
    (data3, 'sent_tokens_bi_spacy_lem.p'),
]:
    with open(os.path.join(data_folder_path, file_name), 'wb') as out_file:
        pkl.dump(corpus, out_file)

In [0]:
def _load_tokens(path):
    """Unpickle one token corpus and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as in_file:
        return pkl.load(in_file)

# NOTE(review): these are paths relative to the working directory, while the
# save cell writes under data_folder_path -- confirm both point at the same files.
data = _load_tokens('data/sent_tokens.p')
data0= _load_tokens('data/sent_tokens_bi.p')
data1= _load_tokens('data/sent_tokens_bi_spacy.p')
data2= _load_tokens('data/sent_tokens_bi_spacy_eng.p')
data3= _load_tokens('data/sent_tokens_bi_spacy_lem.p')

### train test split & vocabulary prep

In [0]:
# NOTE(review): this rebinds `data` to data0, so a later cell that lists `data`
# next to data0 sees the same corpus twice unless `data` is reloaded first.
data = data0 #basic preprocessed
# split dataset into train and test set
from sklearn.model_selection import train_test_split
# A fixed seed makes the split -- and with it the vocabulary and every number
# derived from it -- identical on each Restart & Run All.
train, test = train_test_split(data, test_size=0.2, random_state=10)

# Flatten the training documents into one token list for vocabulary counting.
all_tokens = list(itertools.chain.from_iterable(train))


In [0]:
def build_vocab(all_tokens, max_vocab_size):
    '''
    Build the token <-> index mappings from the most frequent tokens.

    '<pad>' and '<unk>' occupy the first two slots of id2token and are mapped
    to the module-level PAD_IDX / UNK_IDX (read at call time); real tokens are
    numbered from 2 upwards in decreasing order of frequency.

    Parameters:
     all_tokens: iterable of tokens; may be empty.
     max_vocab_size: maximum number of distinct real tokens to keep.

    Returns:
     token2id, id2token  (in that order)
    '''
    token_counter = Counter(all_tokens)
    # Taking the tokens straight from the (token, count) pairs also works when
    # there are no tokens at all, where unpacking zip(*pairs) would fail.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    # Give indices from 2 to the vocab
    token2id = {token: index for index, token in enumerate(vocab, start=2)}
    # Add pad and unk to vocab
    id2token = ['<pad>', '<unk>'] + vocab
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token


In [267]:
# Upper bound on the number of real tokens kept; the vocabulary gets two more
# entries for the special tokens.
max_vocab_size = 15000
# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

# build_vocab returns (token2id, id2token); show the first entries as a sanity check.
token2id, id2token = build_vocab(all_tokens,max_vocab_size) 
id2token[:20]

['<pad>',
 '<unk>',
 'one',
 'said',
 'would',
 'also',
 'time',
 'people',
 'new',
 'like',
 'may',
 'get',
 'use',
 'could',
 'first',
 'make',
 'information',
 'us',
 'even',
 'two']

In [0]:
def token2index_dataset(tokens_data, token2id, id2token):
    """
    Replace every token of every document by its vocabulary index.

    Tokens missing from token2id are mapped to the module-level UNK_IDX.
    id2token is accepted but not used.

    Returns a list with one list of indices per document.
    """
    def encode(document):
        ids = []
        for token in document:
            if token in token2id:
                ids.append(token2id[token])
            else:
                ids.append(UNK_IDX)
        return ids

    return [encode(document) for document in tokens_data]

x_train = token2index_dataset(train, token2id, id2token)
# Documents have different lengths, so request an object array explicitly:
# recent NumPy versions refuse to build a ragged array implicitly.
x_train = np.array([np.array(document) for document in x_train], dtype=object)

In [0]:
def onehot(data, min_length):
    """
    Count how often each non-negative integer occurs in `data`.

    Despite the name, the result holds occurrence counts (np.bincount), not a
    0/1 indicator; it has at least `min_length` entries.
    """
    counts = np.bincount(data, minlength=min_length)
    return counts

# Turn every index list into a count vector of length max_vocab_size+2
# (the +2 covers <pad> and <unk>). Documents whose indices sum to 0 are dropped.
# NOTE: this rebinds x_train from index lists to count vectors, so the cell is
# not safe to run twice without re-running the previous one.
x_train = np.array([onehot(doc.astype('int'),max_vocab_size+2) for doc in x_train if np.sum(doc)!=0])
# Number of training documents that survived the filter.
n_samples_tr = x_train.shape[0]
docs_tr = x_train

In [0]:
# Same pipeline as for the training set, using the vocabulary built from train only.
x_val = token2index_dataset(test, token2id, id2token)
# Documents have different lengths, so request an object array explicitly:
# recent NumPy versions refuse to build a ragged array implicitly.
x_val = np.array([np.array(document) for document in x_val], dtype=object)
# Count vector per document; documents whose indices sum to 0 are dropped.
x_val = np.array([onehot(doc.astype('int'),max_vocab_size+2) for doc in x_val if np.sum(doc)!=0])
n_samples_te = x_val.shape[0]
docs_te = x_val

In [0]:
## Model and Train

In [0]:
# Short alias for the TF-Slim layers used by the VAE class.
slim = tf.contrib.slim

# Start from an empty default graph before the model class is defined/used.
tf.reset_default_graph()

class VAE(object):
    """
    Variational autoencoder over bag-of-words count vectors (TensorFlow 1.x graph).

    See "Auto-Encoding Variational Bayes" by Kingma and Welling for more details.

    Constructing an instance builds the encoder/decoder graph and the loss,
    opens a tf.InteractiveSession (kept as self.sess) and initializes every
    variable.

    Parameters
    ----------
    network_architecture: dict with the keys n_input, n_hidden_recog_1,
        n_hidden_recog_2, n_hidden_gener_1 and n_z.
    transfer_fct: stored on the instance; the encoder layers below call
        tf.nn.softplus directly.
    learning_rate: learning rate handed to the Adam optimizer.
    batch_size: number of rows of the noise tensor drawn for the
        reparameterization step.
    """


    def __init__(self, network_architecture, transfer_fct=tf.nn.softplus,
                 learning_rate=0.01, batch_size=100):
        self.network_architecture = network_architecture
        self.transfer_fct = transfer_fct

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        print('Initial Learning Rate:', self.learning_rate)

        # tf Graph input
        self.x = tf.placeholder(tf.float32, [None, network_architecture["n_input"]], name='input')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.h_dim = (network_architecture["n_z"]) # had a float before
        self.a = 1*np.ones((1 , self.h_dim)).astype(np.float32)                         # a    = 1
        # With a = 1 everywhere, log(a) is 0 and so is the prior mean.
        self.prior_mean = tf.constant((np.log(self.a).T-np.mean(np.log(self.a),1)).T)          # prior_mean  = 0
        # prior_var = (1/a)*(1 - 2/K) + (1/K^2)*sum(1/a), with K = h_dim
        self.prior_var = tf.constant(  ( ( (1.0/self.a)*( 1 - (2.0/self.h_dim) ) ).T +
                                ( 1.0/(self.h_dim*self.h_dim) )*np.sum(1.0/self.a,1) ).T  )
        self.prior_logvar = tf.log(self.prior_var)
        self.means = []

        self._create_network()
        with tf.name_scope('cost'):
            self._create_loss_optimizer()

        # tf.initialize_all_variables() is the deprecated spelling of this call.
        init = tf.global_variables_initializer()

        self.sess = tf.InteractiveSession()
        self.sess.run(init)

    def _create_network(self):
        """
        steps:
        1. build the recognition (encoder) network: two softplus layers, dropout,
           then batch-normalized linear heads for posterior mean and log-variance
        2. sample z with the reparameterization trick
        3. build the reconstruction (decoder) network:
           softmax(batch_norm(linear(dropout(softmax(z)))))
        """
        n_z = self.network_architecture['n_z']
        n_hidden_gener_1 = self.network_architecture['n_hidden_gener_1']
        en1 = slim.layers.linear(self.x, self.network_architecture['n_hidden_recog_1'], scope='FC_en1')
        en1 = tf.nn.softplus(en1, name='softplus1')
        en2 = slim.layers.linear(en1,    self.network_architecture['n_hidden_recog_2'], scope='FC_en2')
        en2 = tf.nn.softplus(en2, name='softplus2')
        en2_do = slim.layers.dropout(en2, self.keep_prob, scope='en2_dropped')
        self.posterior_mean   = slim.layers.linear(en2_do, self.network_architecture['n_z'], scope='FC_mean')
        self.posterior_logvar = slim.layers.linear(en2_do, self.network_architecture['n_z'], scope='FC_logvar')
        # NOTE(review): batch_norm is called without an is_training argument, so
        # presumably it always normalizes with the statistics of the current feed --
        # confirm that is intended for the single-document feeds in test()/topic_prop().
        self.posterior_mean   = slim.layers.batch_norm(self.posterior_mean, scope='BN_mean')
        self.posterior_logvar = slim.layers.batch_norm(self.posterior_logvar, scope='BN_logvar')

        with tf.name_scope('z_scope'):
            # The noise has a fixed first dimension of batch_size.
            # NOTE(review): a feed with a single row presumably broadcasts against
            # it, giving z batch_size rows -- confirm this is the intended behavior.
            eps = tf.random_normal((self.batch_size, n_z), 0, 1,                            # take noise
                                   dtype=tf.float32)
            self.z = tf.add(self.posterior_mean,
                            tf.multiply(tf.sqrt(tf.exp(self.posterior_logvar)), eps))         # reparameterization z
            self.posterior_var = tf.exp(self.posterior_logvar)

        self.p = slim.layers.softmax(self.z)
        p_do = slim.layers.dropout(self.p, self.keep_prob, scope='p_dropped')               # dropout(softmax(z))
        decoded = slim.layers.linear(p_do, n_hidden_gener_1, scope='FC_decoder')

        # Distribution over the n_hidden_gener_1 output units: softmax(bn(linear(...)))
        self.x_reconstr_mean = tf.nn.softmax(slim.layers.batch_norm(decoded, scope='BN_decoder'))

        print(self.x_reconstr_mean)

    def _create_loss_optimizer(self):
        """Define self.cost (reconstruction term + KL term, averaged over rows) and the Adam train op."""
        tensor = self.x * tf.log(self.x_reconstr_mean+1e-10)                                                   # prevent log(0)
        indices = list(range(1, tensor.shape[1])) # exclude 'pad', include 'unk'
        result = tf.gather(tensor, indices, axis=1)
        # cross entropy on categorical: -sum(y*log(p)), without column 0
        NL = -tf.reduce_sum(result, 1)

        # KL divergence between two diagonal Gaussians (posterior vs. prior):
        # 0.5 * ( sum(var_q/var_p + (mu_q-mu_p)^2/var_p + log var_p - log var_q) - K )
        var_division    = self.posterior_var  / self.prior_var
        diff            = self.posterior_mean - self.prior_mean
        diff_term       = diff * diff / self.prior_var
        logvar_division = self.prior_logvar - self.posterior_logvar
        KLD = 0.5 * (tf.reduce_sum(var_division + diff_term + logvar_division, 1) - self.h_dim )

        self.cost = tf.reduce_mean(NL + KLD)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,beta1=0.99,epsilon=0.01).minimize(self.cost)

    def partial_fit(self, X):
        """Run one optimizer step on batch X with dropout keep_prob 0.8.

        Returns (cost, values of the decoder weight matrix, values of self.p),
        all fetched in the same session run as the update.
        """
        # Find the decoder weight variable once and reuse it afterwards instead
        # of scanning every graph variable on each training step.
        if getattr(self, 'decoder_weight', None) is None:
            self.decoder_weight = [v for v in tf.global_variables() if v.name=='FC_decoder/weights:0'][0]
        _, cost, emb, p = self.sess.run(
            (self.optimizer, self.cost, self.decoder_weight, self.p),
            feed_dict={self.x: X, self.keep_prob: .8})
        return cost,emb,p

    def test(self, X):
        """Evaluate self.cost for a single document X (fed as a one-row batch, no dropout).
        """
        cost = self.sess.run((self.cost),feed_dict={self.x: np.expand_dims(X, axis=0),self.keep_prob: 1.0})
        return cost
    def topic_prop(self, X):
        """theta_ is the topic proportion vector. Apply softmax transformation to it before use.

        Returns the value of self.z for a single document X (fed as a one-row batch, no dropout).
        """
        theta_ = self.sess.run((self.z),feed_dict={self.x: np.expand_dims(X, axis=0),self.keep_prob: 1.0})
        return theta_

In [0]:
def create_minibatch(data,batch_size):
    """
    Endless generator of random minibatches.

    Every step draws `batch_size` row indices uniformly, with replacement,
    from a RandomState seeded with 10 and yields the corresponding rows of
    `data`; the sequence of batches is therefore the same for every new generator.
    """
    sampler = np.random.RandomState(10)

    while True:
        # Indices are sampled independently each time, so rows can repeat.
        picked_rows = sampler.randint(data.shape[0], size=batch_size)
        yield data[picked_rows]

In [0]:
def print_top_words(beta, feature_names, n_top_words=10):
    """
    Print one line per row of `beta` with the `n_top_words` feature names that
    have the largest values in that row, largest first, framed by a header and
    a footer line.
    """
    print('---------------Printing the Topics------------------')
    for topic_index in range(len(beta)):
        # argsort is ascending; the reversed slice keeps the last n_top_words indices.
        ranked = beta[topic_index].argsort()[:-n_top_words - 1:-1]
        print(" ".join(feature_names[j] for j in ranked))
    print('---------------End of Topics------------------')

In [0]:
def cal_val_ppl(model,val):
    """
    Approximate the perplexity of `model` on the documents in `val`.

    For every document the cost returned by model.test(doc) is divided by the
    number of counted tokens outside column 0 (the pad column); the result is
    exp(mean of these per-token costs).

    Parameters:
     model: object with a test(doc) method that returns a scalar cost.
     val: iterable of count vectors (anything with .astype and slicing).

    Returns:
     the perplexity estimate; NaN if no document has any non-pad token.
    """
    per_token_cost = []
    for doc in val:
        doc = doc.astype('float32')
        n_d = np.sum(doc[1:]) # count non-pad
        if n_d == 0:
            # Nothing to normalize by: skip before paying for a model evaluation.
            continue
        per_token_cost.append(model.test(doc) / n_d)
    return np.exp(np.mean(np.array(per_token_cost)))

In [0]:
def model_train(network_architecture, minibatches, val, learning_rate=0.01,
          batch_size=200, training_epochs=100, display_step=5, n_samples=None):
    """
    Build a fresh VAE and train it, tracking the best validation perplexity.

    Parameters
    ----------
    network_architecture: dict passed on to VAE.
    minibatches: iterator; next(minibatches) must return one training batch.
    val: validation documents, handed to cal_val_ppl after every epoch.
    learning_rate, batch_size: passed on to VAE.
    training_epochs: number of epochs to run.
    display_step: a progress line is printed every `display_step` epochs.
    n_samples: number of training documents; sets the number of batches per
        epoch and the weight of each batch in the running average cost.
        Defaults to the module-level n_samples_tr.

    Returns
    -------
    (vae, emb, best_val_ppl): the trained model, the decoder weights returned by
    the last partial_fit call (0 if no batch was run) and the lowest validation
    perplexity seen (inf if no epoch was run).
    """
    tf.reset_default_graph()
    vae = VAE(network_architecture,transfer_fct=tf.nn.softplus,
                learning_rate=learning_rate, batch_size=batch_size)
    # The writer is only needed to dump the graph to 'logs'; close it right away
    # so repeated calls do not pile up open event files.
    writer = tf.summary.FileWriter('logs', tf.get_default_graph())
    writer.close()
    if n_samples is None:
        n_samples = n_samples_tr
    emb=0
    # Start from +inf so the first measured perplexity always becomes the best one.
    best_val_ppl = float('inf')
    total_batch = int(n_samples / batch_size)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            batch_xs = next(minibatches)
            # Fit training using batch data
            cost,emb,p = vae.partial_fit(batch_xs)
            # Compute average loss
            avg_cost += cost / n_samples * batch_size

            if np.isnan(avg_cost):
                # Builtin int: the np.int alias no longer exists in recent NumPy.
                print(epoch,i,np.sum(batch_xs,1).astype(int),batch_xs.shape)
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                # return vae,emb
                sys.exit()
        # record best val ppl
        val_ppl = cal_val_ppl(vae,val)
        best_val_ppl = min(best_val_ppl,val_ppl)
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), \
                  "avg train cost=", "{:.9f} approximated test PPL is: {:.9f}".format(avg_cost,val_ppl))

    return vae,emb,best_val_ppl

In [0]:
def search_best_params(data, max_vocab_size, learning_rate, batch_size, layer1, layer2, num_topics, epochs, random_state=10):
    """
    Train one VAE for a single hyper-parameter configuration.

    Splits `data` 80/20, builds the vocabulary from the training part only,
    turns both parts into count vectors and calls model_train.

    Parameters
    ----------
    data: list of tokenized documents.
    max_vocab_size: maximum number of real tokens in the vocabulary.
    learning_rate, batch_size: optimizer / batching settings.
    layer1, layer2: sizes of the two encoder layers.
    num_topics: dimensionality of the latent space.
    epochs: number of training epochs.
    random_state: seed for the train/test split, so every configuration is
        evaluated on the same split of a given corpus.

    Returns
    -------
    Whatever model_train returns: (vae, emb, best_val_ppl).
    """
    #train,test split
    train, test = train_test_split(data, test_size=0.2, random_state=random_state)
    #build vocab for train and index for train,test
    all_tokens = []
    for doc_tokens in train:
        all_tokens += doc_tokens
    token2id, id2token = build_vocab(all_tokens,max_vocab_size)

    x_train = token2index_dataset(train, token2id, id2token)
    # Documents have different lengths, so request an object array explicitly:
    # recent NumPy versions refuse to build a ragged array implicitly.
    x_train = np.array([np.array(document) for document in x_train], dtype=object)
    x_train = np.array([onehot(doc.astype('int'),max_vocab_size+2) for doc in x_train if np.sum(doc)!=0])
    # NOTE(review): model_train reads the module-level n_samples_tr, not this
    # local, so this value does not reach the training loop.
    n_samples_tr = x_train.shape[0]
    x_val = token2index_dataset(test, token2id, id2token)
    x_val = np.array([np.array(document) for document in x_val], dtype=object)
    x_val = np.array([onehot(doc.astype('int'),max_vocab_size+2) for doc in x_val if np.sum(doc)!=0])

    #collate batches
    tf.reset_default_graph()
    network_architecture = \
        dict(n_hidden_recog_1=layer1, # 1st layer encoder neurons
             n_hidden_recog_2=layer2, # 2nd layer encoder neurons
             n_hidden_gener_1=x_train.shape[1], # decoder output size = length of a count vector
             n_input=x_train.shape[1], # input size = length of a count vector
             n_z=num_topics)  # dimensionality of latent space

    minibatches = create_minibatch(x_train.astype('float32'),batch_size=batch_size)

    return model_train(network_architecture, minibatches,x_val,learning_rate,batch_size, training_epochs=epochs, display_step=5)


In [0]:
#tune hyper-params and measure time efficiency
import time
res = []
list_data = [data,data0,data1,data2,data3]
for i in range(len(list_data)):
    DATA = list_data[i]
    for MAX_VOCAB in list(range(5000,17000,2000)):
        for lr in [0.01,0.008,0.005,0.002]:
            for BATCH_SIZE in [64,128,160,200]:
                for LAYER1 in [60,100,200,400,]:
                    if LAYER1 == 60:
                        LAYER2 = int(LAYER1//1.5)
                    else:
                        LAYER2 = LAYER1//2
                    time1 = time.time()
                    vae, emb, best_val_ppl = search_best_params(DATA, MAX_VOCAB, learning_rate=lr,
                                                    batch_size=BATCH_SIZE,layer1=LAYER1, layer2=LAYER2, num_topics=20, epochs=100)
                    time_span = time.time() - time1
                    print('best_val_ppl:',best_val_ppl)
                    record = \
                        dict(dataset=i-1, 
                             max_vocab_size=MAX_VOCAB,
                             learning_rate=lr, 
                             batch_size=BATCH_SIZE, 
                             layer1_size = LAYER1,
                             layer2_size = LAYER2,
                             vae = vae, emb = emb, 
                             best_val_ppl = best_val_ppl,
                             time = time_span) 
                    res.append(record)

pkl.dump(res, open(os.path.join(res_folder_path,'res_tune.p'),'wb'))

Initial Learning Rate: 0.01
Tensor("Softmax_1:0", shape=(64, 5002), dtype=float32)




Epoch: 0001 avg train cost= 2619.614476562 approximated test PPL is: 2710.492919922
Epoch: 0006 avg train cost= 2055.112280273 approximated test PPL is: 560.703186035
Epoch: 0011 avg train cost= 1991.450687500 approximated test PPL is: 538.786560059
Epoch: 0016 avg train cost= 2010.081862305 approximated test PPL is: 530.249633789
Epoch: 0021 avg train cost= 1975.214496094 approximated test PPL is: 530.424621582
Epoch: 0026 avg train cost= 1942.368915039 approximated test PPL is: 523.732788086
Epoch: 0031 avg train cost= 1993.663530273 approximated test PPL is: 529.871765137
Epoch: 0036 avg train cost= 1955.361846680 approximated test PPL is: 534.825439453
Epoch: 0041 avg train cost= 1931.640883789 approximated test PPL is: 533.190734863
Epoch: 0046 avg train cost= 1930.169207031 approximated test PPL is: 533.718811035
Epoch: 0051 avg train cost= 1937.376344727 approximated test PPL is: 537.507385254
Epoch: 0056 avg train cost= 1943.940034180 approximated test PPL is: 540.214050293
Epo



Epoch: 0001 avg train cost= 2745.413977539 approximated test PPL is: 2710.861328125
Epoch: 0006 avg train cost= 2089.366898437 approximated test PPL is: 634.089050293
Epoch: 0011 avg train cost= 1992.422839844 approximated test PPL is: 502.629791260


In [0]:
def gen_minibatch_whole_train(data, batch_size=200):
    """
    Walk through `data` in order, yielding consecutive non-overlapping slices
    of `batch_size` rows (200 by default).

    The generator never stops: once `data` is exhausted it keeps yielding empty
    slices, so the caller decides how many batches to draw.
    """
    start = 0
    while True:
        yield data[start:start + batch_size]
        start += batch_size

In [0]:
batch_size=200
minibatches = gen_minibatch_whole_train(docs_tr.astype('float32'))
# Only full batches are visited; a trailing partial batch is left out.
whole_train = int(n_samples_tr / batch_size)
# Read the decoder weights straight from the session instead of taking them
# from a training step.
decoder_weight = [v for v in tf.global_variables() if v.name=='FC_decoder/weights:0'][0]
emb = vae.sess.run(decoder_weight)
x_var_mean = []
for i in range(whole_train):
    batch_xs = next(minibatches)
    # Inference only: fetch softmax(z) without running the optimizer and without
    # dropout, so collecting the topic proportions does not change the model.
    p = vae.sess.run(vae.p, feed_dict={vae.x: batch_xs, vae.keep_prob: 1.0})
    x_var_mean += p.tolist()

In [0]:
# Stack the collected per-document rows into one array (one row per document).
x_var_mean = np.array(x_var_mean)

In [0]:
def top_words(beta, feature_names, n_top_words=10):
    """
    For every row of `beta`, return the `n_top_words` feature names with the
    largest values in that row, largest first.

    Returns a list with one list of names per row.
    """
    # argsort is ascending; the reversed slice keeps the last n_top_words indices.
    return [
        [feature_names[j] for j in beta[row].argsort()[:-n_top_words - 1:-1]]
        for row in range(len(beta))
    ]

In [0]:
# Feature names = the tokens of token2id ordered by their index.
# NOTE(review): this uses the module-level token2id; confirm it is the
# vocabulary that `emb` was trained with.
topic_words = top_words(emb, list(zip(*sorted(token2id.items(), key=lambda x: x[1])))[0])

In [0]:
# Print the top words of every row of emb, using the tokens of token2id ordered by index as names.
print_top_words(emb, list(zip(*sorted(token2id.items(), key=lambda x: x[1])))[0])

In [0]:
from sklearn.preprocessing import scale
from sklearn.linear_model import Lasso
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import networkx as nx
from collections import Counter

def build_lasso_graph(x, l1_lambda, topic_words,both=True,):
    """
    A adaption from CTM's lasso-graph.r script

    Each column of the standardized data is regressed on all other columns
    with an L1 penalty; columns that receive a positive coefficient become
    neighbors of the response column. The neighborhoods are then combined into
    an adjacency matrix, which is drawn with networkx.

    Parameters
    ----------
    x: numpy.array
        N x K data matrix -- e.g., the variational means ("final-lambda.dat")
    l1_lambda: float
        L1 regularization strength, passed to Lasso as `alpha`
    topic_words: sequence of sequences of str
        words used to label node i of the graph, one entry per topic
    both: bool
        if `both is True` an edge needs each endpoint to be in the other's
        neighborhood (intersection); for any other value one direction is
        enough (union)

    Returns
    -------
    ihat: numpy.array
        K x K boolean adjacency matrix of the topic graph (diagonal is True)
    fig: matplotlib figure holding the drawn graph
    """
    x = scale(x)
    # How many rows have their largest (standardized) value in each column.
    topic_count = Counter(x.argmax(axis=1))
    N, K = x.shape
    Shat = np.zeros((K,K), dtype=bool)
    print('Parameters:')
    print(f'N={N}, K={K}, lambda={l1_lambda}')
    print()
    print('Fitting...')
    all_columns = np.arange(K)
    for j in range(K):
        column_mask = np.ones(K, dtype=bool)
        column_mask[j] = False
        # The response is the j-th column
        y = x[:,j]
        X = x[:,column_mask]
        # Do the l1-regularized regression. `normalize` is no longer passed:
        # False was its default and newer scikit-learn releases reject the argument.
        lasso_model = Lasso(
            alpha=l1_lambda,
            tol=1e-6,
            #positive=True,
            max_iter=10000,
        )
        lasso_model.fit(X, y)
        # Map coefficient positions back to column numbers of x.
        indices = all_columns[column_mask]
        nonzero = indices[lasso_model.coef_>0]
        Shat[j, nonzero] = True
        Shat[j,j] = True
    print('Fitting completed!')
    # Include an edge if either (both is not True) or both (both is True) endpoints are neighbors
    if both is True:
        Ihat = Shat & Shat.T
    else:
        Ihat = Shat | Shat.T
    # Visualize topic graph
    # Construct Graph from adjacency matrix
    G = nx.from_numpy_array(Ihat)
    annotations = [f'({np.round(topic_count[i]*100/N, 2)}%)\n'+"\n".join(topic_word) for i, topic_word in enumerate(topic_words)]
    # One label per node: use K rather than a fixed count of 20.
    node_label_dict = dict(zip(range(K), annotations))
    pos = nx.spring_layout(G, k=0.3)
    fig = plt.figure(figsize=(40,20))
    plt.title(f'Number of topics: {K}, L1-Regularization Strength: {l1_lambda}')
    nx.draw(G, pos, font_size=20, with_labels=True, labels=node_label_dict, node_size=5)
    return Ihat, fig

In [0]:
# Build and draw the topic graph with regularization strength 0.3, keeping only mutual neighbors.
topic_graph_adj_matrix, topic_graph_fig = build_lasso_graph(x_var_mean, 3e-1, topic_words, both=True)