In [1]:
# Import packages
import numpy as np
import gensim
import nltk
import pprint 
from gensim import corpora
from nltk.tokenize import RegexpTokenizer

In [8]:
# Load data
X  = np.load('one_hot_coco.npy')

In [9]:
# Get data statistics, as a sanity check
print("Data shape: ", X.shape)
print("Number of 0s: ", np.sum(X == 1))
print("Number of 1s: ", np.sum(X == 0))
print("Anomailes: ",np.sum([X < 0]))

# Note: number of 1s is far more than 0s. Thus, there are many 'stopword' features. Look at ways to remove/normalize.

Data shape:  (118060, 4096)
Number of 0s:  66475570
Number of 1s:  417098190
Anomailes:  0


In [10]:
# Process the raw data
doc = []
for i in range(1000): #X.shape[0]): # 118060
    doc_list = []
    for j in range(X.shape[1]): # 4096
        doc_list.append((j, X[i,j]))
    doc.append(doc_list)

In [85]:
# Define parameters for topic modelling
num_topics = 50
num_words = 4096                                          # Number of top features to be displayed per topic
model_name = str(num_topics) + '-topics.model'
topics_file_name = 'topic_model_features_' + str(num_topics) +'_topics.npy'

In [86]:
# Create the Topic Model
ldamodel = gensim.models.ldamodel.LdaModel(doc, num_topics = num_topics, passes=15)
ldamodel.save(model_name)

In [87]:
# Raw output
topic_distribution = ldamodel.print_topics(num_words=num_words)
#print (topic_distribution)

In [88]:
# Extract feature ids per topic from raw output
topics = np.zeros([num_topics, num_words])
tokenizer = RegexpTokenizer(r'\w+')

for topic_ids in topic_distribution:
    topic_id = topic_ids[0]
    all_features = topic_ids[1]

    feature_values = tokenizer.tokenize(all_features)     # Tokenize the string to keep all numbers
    feature_values = list(map(int, feature_values))       # Convert values of list to int
    feature_values = np.asarray(feature_values[2::3]) * np.asarray(feature_values[1::3])   # Remove all features with 0 weightage in topic
    
    topics[topic_id] = feature_values
        
# Save feature values in npy file
np.save(topics_file_name, topics)

#topics is a numpy array with one row representing one topic. The columns contain the indices of the features belonging to that topic, in decreasing order of how strongly linked a feature is to that topic. The vectors have been padded with 0s in the end.