In [1]:
# Import packages
import numpy as np
import gensim
import nltk
import pprint 
import operator
from gensim import corpora
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load data
X  = np.load('one_hot_coco.npy')

In [3]:
# Get data statistics, as a sanity check
print("Data shape: ", X.shape)
print("Number of 0s: ", np.sum(X == 1))
print("Number of 1s: ", np.sum(X == 0))
print("Anomailes: ",np.sum([X < 0]))

# Note: number of 1s is far more than 0s. Thus, there are many 'stopword' features. Look at ways to remove/normalize.

Data shape:  (118060, 4096)
Number of 0s:  66475570
Number of 1s:  417098190
Anomailes:  0


In [4]:
# Define parameters for topic modelling
num_topics = 2
num_words = 4096                                          # Number of top features to be displayed per topic
num_images = 1000                                         # Need better processor to make this X.shape[0]

model_name = str(num_topics) + '-topics.model'
topics_file_name = 'topic_model_features_' + str(num_topics) +'_topics.npy'
topics_per_image_file_name = 'topics_per_image-' + str(num_topics) +'_topics.npy'

In [5]:
# Process the raw data
doc = []
bow = []
for i in range(num_images):     # should be X.shape[0], 118060
    doc_list = []
    bow_list = []
    for j in range(X.shape[1]): # 4096
        doc_list.append((j, X[i,j]))
        bow_list.append(X[i,j])
    doc.append(doc_list)
    bow.append(bow_list)

In [6]:
# Create the Topic Model
#ldamodel = gensim.models.ldamodel.LdaModel(doc, num_topics = num_topics, passes=15)
#ldamodel.save(model_name)

# Or load the saved model
ldamodel = gensim.models.ldamodel.LdaModel.load("../2 topics/"+model_name)

In [10]:
# Raw output
topic_distribution = ldamodel.print_topics(num_words=num_words)

In [8]:
'''Save distribution of features for every topic'''

# Extract feature ids per topic from raw output
topics = np.zeros([num_topics, num_words])
tokenizer = RegexpTokenizer(r'\w+')

for topic_ids in topic_distribution:
    topic_id = topic_ids[0]
    all_features = topic_ids[1]

    feature_values = tokenizer.tokenize(all_features)     # Tokenize the string to keep all numbers
    feature_values = list(map(int, feature_values))       # Convert values of list to int
    feature_values = np.asarray(feature_values[2::3]) * np.asarray(feature_values[1::3])   # Remove all features with 0 weightage in topic
    
    topics[topic_id] = feature_values
        
# Save feature values in npy file
np.save(topics_file_name, topics)

#topics is a numpy array with one row representing one topic. The columns contain the indices of the features belonging to that topic, in decreasing order of how strongly linked a feature is to that topic. The vectors have been padded with 0s in the end.

In [9]:
'''Save distribution of topics, for every image'''

# Save top k topics per image to file
k = 5

topics_per_image = ldamodel[doc]
topics_per_image_matrix = np.zeros([len(topics_per_image),k,2])
print ("No of documents: ", len(topics_per_image))

i = 0
for image_topics in topics_per_image:
    image_topics.sort(key=operator.itemgetter(1), reverse=True)
    print ("Image ",i, " Topics: ", image_topics)
    for j in range(min(k,len(image_topics))):
        topics_per_image_matrix[i][j][0] = image_topics[j][0]    # Store topic id
        topics_per_image_matrix[i][j][1] = image_topics[j][1]    # Store probability of document having that topic
    i = i+1
    
# Save feature values in npy file
np.save(topics_per_image_file_name, topics_per_image_matrix)

No of documents:  1000
Image  0  Topics:  [(1, 0.5155983), (0, 0.48440167)]
Image  1  Topics:  [(0, 0.9979318)]
Image  2  Topics:  [(0, 0.6317239), (1, 0.36827615)]
Image  3  Topics:  [(0, 0.93136364), (1, 0.06863637)]
Image  4  Topics:  [(0, 0.74622107), (1, 0.25377893)]
Image  5  Topics:  [(1, 0.91404355), (0, 0.08595643)]
Image  6  Topics:  [(1, 0.7864873), (0, 0.21351267)]
Image  7  Topics:  [(1, 0.6287522), (0, 0.37124783)]
Image  8  Topics:  [(0, 0.63895065), (1, 0.36104935)]
Image  9  Topics:  [(0, 0.99784166)]
Image  10  Topics:  [(0, 0.9982269)]
Image  11  Topics:  [(1, 0.52623683), (0, 0.4737632)]
Image  12  Topics:  [(1, 0.844754), (0, 0.155246)]
Image  13  Topics:  [(1, 0.9760723), (0, 0.023927657)]
Image  14  Topics:  [(0, 0.7247123), (1, 0.27528766)]
Image  15  Topics:  [(1, 0.56107044), (0, 0.43892953)]
Image  16  Topics:  [(0, 0.9957749)]
Image  17  Topics:  [(1, 0.7789603), (0, 0.22103976)]
Image  18  Topics:  [(1, 0.81501395), (0, 0.1849861)]
Image  19  Topics:  [(0, 

Image  184  Topics:  [(1, 0.9983349)]
Image  185  Topics:  [(1, 0.5403609), (0, 0.45963904)]
Image  186  Topics:  [(0, 0.84756386), (1, 0.15243617)]
Image  187  Topics:  [(0, 0.6919452), (1, 0.30805483)]
Image  188  Topics:  [(0, 0.9238377), (1, 0.076162264)]
Image  189  Topics:  [(0, 0.7445396), (1, 0.2554604)]
Image  190  Topics:  [(0, 0.8137161), (1, 0.18628389)]
Image  191  Topics:  [(1, 0.86139345), (0, 0.13860661)]
Image  192  Topics:  [(0, 0.90678513), (1, 0.09321484)]
Image  193  Topics:  [(0, 0.6504822), (1, 0.34951788)]
Image  194  Topics:  [(0, 0.68634933), (1, 0.31365067)]
Image  195  Topics:  [(0, 0.94446504), (1, 0.05553499)]
Image  196  Topics:  [(1, 0.9985655)]
Image  197  Topics:  [(0, 0.9842467), (1, 0.015753388)]
Image  198  Topics:  [(0, 0.6362561), (1, 0.36374396)]
Image  199  Topics:  [(0, 0.9976931)]
Image  200  Topics:  [(0, 0.82177126), (1, 0.1782287)]
Image  201  Topics:  [(1, 0.5617634), (0, 0.43823662)]
Image  202  Topics:  [(0, 0.5381761), (1, 0.46182388)]


Image  370  Topics:  [(0, 0.82190275), (1, 0.17809719)]
Image  371  Topics:  [(1, 0.64813036), (0, 0.35186964)]
Image  372  Topics:  [(0, 0.9979052)]
Image  373  Topics:  [(0, 0.793878), (1, 0.20612192)]
Image  374  Topics:  [(1, 0.63179266), (0, 0.36820725)]
Image  375  Topics:  [(0, 0.82634145), (1, 0.17365864)]
Image  376  Topics:  [(1, 0.99881065)]
Image  377  Topics:  [(1, 0.9984692)]
Image  378  Topics:  [(0, 0.7321578), (1, 0.26784223)]
Image  379  Topics:  [(0, 0.9984366)]
Image  380  Topics:  [(0, 0.64975286), (1, 0.35024717)]
Image  381  Topics:  [(1, 0.8279489), (0, 0.17205109)]
Image  382  Topics:  [(0, 0.98390687), (1, 0.01609313)]
Image  383  Topics:  [(1, 0.786175), (0, 0.21382494)]
Image  384  Topics:  [(1, 0.99855214)]
Image  385  Topics:  [(1, 0.99678713)]
Image  386  Topics:  [(1, 0.9979934)]
Image  387  Topics:  [(1, 0.6540099), (0, 0.3459901)]
Image  388  Topics:  [(0, 0.9852778), (1, 0.014722208)]
Image  389  Topics:  [(0, 0.9969937)]
Image  390  Topics:  [(0, 0.9

Image  558  Topics:  [(1, 0.9638709), (0, 0.03612913)]
Image  559  Topics:  [(0, 0.9989017)]
Image  560  Topics:  [(1, 0.99790144)]
Image  561  Topics:  [(1, 0.7352556), (0, 0.26474437)]
Image  562  Topics:  [(0, 0.8287161), (1, 0.17128386)]
Image  563  Topics:  [(1, 0.56194174), (0, 0.4380583)]
Image  564  Topics:  [(1, 0.6090351), (0, 0.39096487)]
Image  565  Topics:  [(1, 0.6185232), (0, 0.3814768)]
Image  566  Topics:  [(0, 0.99774444)]
Image  567  Topics:  [(1, 0.99809957)]
Image  568  Topics:  [(0, 0.98048276), (1, 0.019517297)]
Image  569  Topics:  [(1, 0.9987178)]
Image  570  Topics:  [(0, 0.99841887)]
Image  571  Topics:  [(1, 0.892793), (0, 0.10720703)]
Image  572  Topics:  [(1, 0.82018185), (0, 0.17981814)]
Image  573  Topics:  [(1, 0.8049584), (0, 0.19504166)]
Image  574  Topics:  [(0, 0.9986135)]
Image  575  Topics:  [(1, 0.9986372)]
Image  576  Topics:  [(1, 0.6670702), (0, 0.33292976)]
Image  577  Topics:  [(0, 0.9982007)]
Image  578  Topics:  [(1, 0.9988254)]
Image  579

Image  746  Topics:  [(1, 0.99786204)]
Image  747  Topics:  [(1, 0.790825), (0, 0.20917499)]
Image  748  Topics:  [(1, 0.6589205), (0, 0.34107944)]
Image  749  Topics:  [(0, 0.88552177), (1, 0.11447829)]
Image  750  Topics:  [(0, 0.9911774)]
Image  751  Topics:  [(0, 0.99877095)]
Image  752  Topics:  [(0, 0.6534695), (1, 0.34653047)]
Image  753  Topics:  [(1, 0.8809032), (0, 0.11909684)]
Image  754  Topics:  [(1, 0.9165315), (0, 0.08346852)]
Image  755  Topics:  [(1, 0.6256906), (0, 0.37430942)]
Image  756  Topics:  [(1, 0.6082323), (0, 0.39176765)]
Image  757  Topics:  [(1, 0.8346171), (0, 0.16538286)]
Image  758  Topics:  [(0, 0.99875265)]
Image  759  Topics:  [(1, 0.97740126), (0, 0.022598747)]
Image  760  Topics:  [(0, 0.8136181), (1, 0.18638186)]
Image  761  Topics:  [(0, 0.6817825), (1, 0.31821755)]
Image  762  Topics:  [(1, 0.67164963), (0, 0.32835037)]
Image  763  Topics:  [(1, 0.99773985)]
Image  764  Topics:  [(0, 0.8170412), (1, 0.18295878)]
Image  765  Topics:  [(0, 0.99355

Image  925  Topics:  [(1, 0.94238824), (0, 0.057611793)]
Image  926  Topics:  [(1, 0.983911), (0, 0.016088998)]
Image  927  Topics:  [(0, 0.5744091), (1, 0.42559087)]
Image  928  Topics:  [(1, 0.9112028), (0, 0.088797234)]
Image  929  Topics:  [(1, 0.63788325), (0, 0.3621167)]
Image  930  Topics:  [(1, 0.66653234), (0, 0.33346766)]
Image  931  Topics:  [(1, 0.9620071), (0, 0.037992895)]
Image  932  Topics:  [(0, 0.6982474), (1, 0.30175266)]
Image  933  Topics:  [(0, 0.82025087), (1, 0.17974909)]
Image  934  Topics:  [(1, 0.52236056), (0, 0.4776395)]
Image  935  Topics:  [(1, 0.9714193), (0, 0.028580772)]
Image  936  Topics:  [(0, 0.9973041)]
Image  937  Topics:  [(0, 0.59675413), (1, 0.4032459)]
Image  938  Topics:  [(0, 0.89990497), (1, 0.10009507)]
Image  939  Topics:  [(0, 0.9980991)]
Image  940  Topics:  [(0, 0.9858336), (1, 0.014166422)]
Image  941  Topics:  [(0, 0.6243449), (1, 0.37565506)]
Image  942  Topics:  [(0, 0.7874169), (1, 0.21258315)]
Image  943  Topics:  [(1, 0.9158837

Documentation:
https://radimrehurek.com/gensim/models/ldamodel.html
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
https://christop.club/2014/05/06/using-gensim-for-lda/
https://nlpforhackers.io/topic-modeling/