In [1]:
# Import packages
import numpy as np
import gensim
import nltk
import pprint 
from gensim import corpora
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load data
X  = np.load('one_hot_coco.npy')

In [3]:
# Get data statistics, as a sanity check
print("Data shape: ", X.shape)
print("Number of 0s: ", np.sum(X == 1))
print("Number of 1s: ", np.sum(X == 0))
print("Anomailes: ",np.sum([X < 0]))

# Note: number of 1s is far more than 0s. Thus, there are many 'stopword' features. Look at ways to remove/normalize.

Data shape:  (118060, 4096)
Number of 0s:  66475570
Number of 1s:  417098190
Anomailes:  0


In [30]:
# Process the raw data
doc = []
bow = []
for i in range(1000): #X.shape[0]): # 118060
    doc_list = []
    bow_list = []
    for j in range(X.shape[1]): # 4096
        doc_list.append((j, X[i,j]))
        bow_list.append(X[i,j])
    doc.append(doc_list)
    bow.append(bow_list)

In [69]:
# Define parameters for topic modelling
num_topics = 2
num_words = 4096                                          # Number of top features to be displayed per topic

model_name = str(num_topics) + '-topics.model'
topics_file_name = 'topic_model_features_' + str(num_topics) +'_topics.npy'
topics_per_image_file_name = 'topics_per_image-' + str(num_topics) +'_topics.npy'

In [6]:
# Create the Topic Model
#ldamodel = gensim.models.ldamodel.LdaModel(doc, num_topics = num_topics, passes=15)
#ldamodel.save(model_name)

# Or load the saved model
ldamodel = gensim.models.ldamodel.LdaModel.load("../2 topics/"+model_name)

In [87]:
# Raw output
topic_distribution = ldamodel.print_topics(num_words=num_words)
#print (topic_distribution)

In [88]:
'''Save distribution of features for every topic'''

# Extract feature ids per topic from raw output
topics = np.zeros([num_topics, num_words])
tokenizer = RegexpTokenizer(r'\w+')

for topic_ids in topic_distribution:
    topic_id = topic_ids[0]
    all_features = topic_ids[1]

    feature_values = tokenizer.tokenize(all_features)     # Tokenize the string to keep all numbers
    feature_values = list(map(int, feature_values))       # Convert values of list to int
    feature_values = np.asarray(feature_values[2::3]) * np.asarray(feature_values[1::3])   # Remove all features with 0 weightage in topic
    
    topics[topic_id] = feature_values
        
# Save feature values in npy file
np.save(topics_file_name, topics)

#topics is a numpy array with one row representing one topic. The columns contain the indices of the features belonging to that topic, in decreasing order of how strongly linked a feature is to that topic. The vectors have been padded with 0s in the end.

In [70]:
'''Save distribution of topics, for every image'''

topics_per_image = ldamodel[doc]
topics_per_image_matrix = np.zeros([len(topic_dist),k,2])
print ("No of documents: ", len(topic_dist))

# Save top k topics per image to file
k = 5

i = 0
for image_topics in topics_per_image:
    image_topics.sort(key=operator.itemgetter(1), reverse=True)
    print ("Image ",i, " Topics: ", image_topics)
    for j in range(min(k,len(image_topics))):
        topics_per_image_matrix[i][j][0] = image_topics[j][0]    # Store topic id
        topics_per_image_matrix[i][j][1] = image_topics[j][1]    # Store probability of document having that topic
    i = i+1
    
# Save feature values in npy file
np.save(topics_per_image_file_name, topics_per_image_matrix)

No of documents:  1000
Image  0  Topics:  [(1, 0.5156155), (0, 0.48438454)]
Image  1  Topics:  [(0, 0.99793255)]
Image  2  Topics:  [(0, 0.6317237), (1, 0.3682763)]
Image  3  Topics:  [(0, 0.93127567), (1, 0.06872433)]
Image  4  Topics:  [(0, 0.746299), (1, 0.25370094)]
Image  5  Topics:  [(1, 0.9141684), (0, 0.0858316)]
Image  6  Topics:  [(1, 0.78649354), (0, 0.21350648)]
Image  7  Topics:  [(1, 0.6287621), (0, 0.37123784)]
Image  8  Topics:  [(0, 0.6389383), (1, 0.36106166)]
Image  9  Topics:  [(0, 0.9978107)]
Image  10  Topics:  [(0, 0.9982269)]
Image  11  Topics:  [(1, 0.5262393), (0, 0.47376075)]
Image  12  Topics:  [(1, 0.8447614), (0, 0.15523866)]
Image  13  Topics:  [(1, 0.97627985), (0, 0.023720127)]
Image  14  Topics:  [(0, 0.7247112), (1, 0.27528882)]
Image  15  Topics:  [(1, 0.5610695), (0, 0.4389305)]
Image  16  Topics:  [(0, 0.9957506)]
Image  17  Topics:  [(1, 0.7789423), (0, 0.22105776)]
Image  18  Topics:  [(1, 0.8150366), (0, 0.18496343)]
Image  19  Topics:  [(0, 0.6

Image  174  Topics:  [(0, 0.99837327)]
Image  175  Topics:  [(1, 0.99702287)]
Image  176  Topics:  [(1, 0.8525599), (0, 0.14744006)]
Image  177  Topics:  [(0, 0.9972496)]
Image  178  Topics:  [(0, 0.6574189), (1, 0.34258115)]
Image  179  Topics:  [(0, 0.99856955)]
Image  180  Topics:  [(1, 0.99805623)]
Image  181  Topics:  [(0, 0.84358937), (1, 0.15641068)]
Image  182  Topics:  [(0, 0.7933176), (1, 0.20668237)]
Image  183  Topics:  [(1, 0.64655757), (0, 0.35344246)]
Image  184  Topics:  [(1, 0.9983341)]
Image  185  Topics:  [(1, 0.5403618), (0, 0.45963818)]
Image  186  Topics:  [(0, 0.84755355), (1, 0.15244643)]
Image  187  Topics:  [(0, 0.69195473), (1, 0.30804527)]
Image  188  Topics:  [(0, 0.92391133), (1, 0.0760887)]
Image  189  Topics:  [(0, 0.7445392), (1, 0.25546077)]
Image  190  Topics:  [(0, 0.8137625), (1, 0.18623753)]
Image  191  Topics:  [(1, 0.86138254), (0, 0.13861744)]
Image  192  Topics:  [(0, 0.90678656), (1, 0.09321347)]
Image  193  Topics:  [(0, 0.6504808), (1, 0.349

Image  353  Topics:  [(0, 0.9959341)]
Image  354  Topics:  [(1, 0.79174477), (0, 0.20825525)]
Image  355  Topics:  [(0, 0.8367106), (1, 0.16328941)]
Image  356  Topics:  [(0, 0.5296159), (1, 0.4703841)]
Image  357  Topics:  [(1, 0.8033114), (0, 0.19668858)]
Image  358  Topics:  [(1, 0.9231068), (0, 0.07689323)]
Image  359  Topics:  [(1, 0.89742225), (0, 0.10257775)]
Image  360  Topics:  [(0, 0.9979248)]
Image  361  Topics:  [(0, 0.9940129)]
Image  362  Topics:  [(0, 0.8634076), (1, 0.1365924)]
Image  363  Topics:  [(1, 0.7958125), (0, 0.20418754)]
Image  364  Topics:  [(0, 0.92653954), (1, 0.07346051)]
Image  365  Topics:  [(0, 0.7305045), (1, 0.26949543)]
Image  366  Topics:  [(1, 0.97954404), (0, 0.020455929)]
Image  367  Topics:  [(1, 0.67024714), (0, 0.32975286)]
Image  368  Topics:  [(0, 0.79299664), (1, 0.20700334)]
Image  369  Topics:  [(1, 0.7295787), (0, 0.2704214)]
Image  370  Topics:  [(0, 0.8218787), (1, 0.17812137)]
Image  371  Topics:  [(1, 0.6481374), (0, 0.35186264)]
Im

Image  525  Topics:  [(0, 0.8665038), (1, 0.13349618)]
Image  526  Topics:  [(0, 0.580312), (1, 0.41968796)]
Image  527  Topics:  [(0, 0.6418352), (1, 0.35816473)]
Image  528  Topics:  [(1, 0.9986799)]
Image  529  Topics:  [(1, 0.7284599), (0, 0.27154005)]
Image  530  Topics:  [(0, 0.84516126), (1, 0.15483871)]
Image  531  Topics:  [(1, 0.5998691), (0, 0.40013096)]
Image  532  Topics:  [(1, 0.9176182), (0, 0.08238179)]
Image  533  Topics:  [(0, 0.979976), (1, 0.020024002)]
Image  534  Topics:  [(0, 0.9782241), (1, 0.02177584)]
Image  535  Topics:  [(0, 0.9977819)]
Image  536  Topics:  [(0, 0.82775074), (1, 0.17224924)]
Image  537  Topics:  [(0, 0.9986515)]
Image  538  Topics:  [(1, 0.97812617), (0, 0.021873876)]
Image  539  Topics:  [(0, 0.8958099), (1, 0.10419006)]
Image  540  Topics:  [(0, 0.7151676), (1, 0.28483248)]
Image  541  Topics:  [(0, 0.78342235), (1, 0.21657766)]
Image  542  Topics:  [(1, 0.5902584), (0, 0.40974167)]
Image  543  Topics:  [(0, 0.99860394)]
Image  544  Topics

Image  734  Topics:  [(1, 0.9963509)]
Image  735  Topics:  [(0, 0.6698793), (1, 0.33012068)]
Image  736  Topics:  [(1, 0.8480961), (0, 0.15190393)]
Image  737  Topics:  [(1, 0.9536382), (0, 0.04636173)]
Image  738  Topics:  [(1, 0.61146426), (0, 0.38853574)]
Image  739  Topics:  [(0, 0.88347554), (1, 0.11652453)]
Image  740  Topics:  [(1, 0.57648337), (0, 0.42351663)]
Image  741  Topics:  [(1, 0.82369167), (0, 0.17630832)]
Image  742  Topics:  [(0, 0.9985181)]
Image  743  Topics:  [(1, 0.5034277), (0, 0.49657226)]
Image  744  Topics:  [(1, 0.85824466), (0, 0.14175536)]
Image  745  Topics:  [(1, 0.8688584), (0, 0.13114166)]
Image  746  Topics:  [(1, 0.99785125)]
Image  747  Topics:  [(1, 0.790849), (0, 0.20915104)]
Image  748  Topics:  [(1, 0.6589139), (0, 0.3410861)]
Image  749  Topics:  [(0, 0.8855658), (1, 0.11443423)]
Image  750  Topics:  [(0, 0.991422)]
Image  751  Topics:  [(0, 0.9987715)]
Image  752  Topics:  [(0, 0.65347004), (1, 0.34653002)]
Image  753  Topics:  [(1, 0.8809765)

Image  909  Topics:  [(1, 0.58130085), (0, 0.4186992)]
Image  910  Topics:  [(1, 0.8193254), (0, 0.18067463)]
Image  911  Topics:  [(1, 0.9359903), (0, 0.06400971)]
Image  912  Topics:  [(0, 0.99281955)]
Image  913  Topics:  [(0, 0.718092), (1, 0.281908)]
Image  914  Topics:  [(0, 0.7815029), (1, 0.21849711)]
Image  915  Topics:  [(0, 0.7939109), (1, 0.20608905)]
Image  916  Topics:  [(0, 0.66324556), (1, 0.33675438)]
Image  917  Topics:  [(0, 0.5614938), (1, 0.43850616)]
Image  918  Topics:  [(1, 0.9934898)]
Image  919  Topics:  [(1, 0.9983564)]
Image  920  Topics:  [(1, 0.6852308), (0, 0.3147692)]
Image  921  Topics:  [(1, 0.9498327), (0, 0.050167322)]
Image  922  Topics:  [(0, 0.6647299), (1, 0.3352701)]
Image  923  Topics:  [(0, 0.99813825)]
Image  924  Topics:  [(0, 0.92668015), (1, 0.07331986)]
Image  925  Topics:  [(1, 0.9426043), (0, 0.05739565)]
Image  926  Topics:  [(1, 0.9838031), (0, 0.01619693)]
Image  927  Topics:  [(0, 0.5744087), (1, 0.42559132)]
Image  928  Topics:  [(

Documentation:
https://radimrehurek.com/gensim/models/ldamodel.html
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
https://christop.club/2014/05/06/using-gensim-for-lda/
https://nlpforhackers.io/topic-modeling/