In [1]:
%load_ext autoreload
%autoreload 2

### Additional Requirements

If you wish to run this entire notebook, you need to install OCTIS

~~~
pip install octis
~~~

In [2]:
import os

# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# Kept above the remaining imports so it is in place before any library
# that might initialise CUDA gets loaded.
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES='0',
)

import pickle
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori

#### Train the model and get topic-word, topic-document distribution

#### For this notebook, we use OCTIS to train the model; you may substitute it with any neural topic model (NTM) of your choosing

In [3]:
from octis.dataset.dataset import Dataset
from octis.models.ProdLDA import ProdLDA

# Number of topics learned by the base model; reused by every later cell.
K = 20

# Load the 20NewsGroup corpus through OCTIS.
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

# Train ProdLDA; `model_output` is the dict the following cells read
# 'topic-word-matrix' and 'topic-document-matrix' from.
# NOTE(review): no random seed is set in this cell, so the learned topics
# (and every number below) may differ between runs.
model = ProdLDA(num_topics=K)
model_output = model.train_model(dataset)



Epoch: [1/100]	Samples: [11415/1141500]	Train Loss: 381.6961956410151	Time: 0:00:01.298062
Epoch: [1/100]	Samples: [2447/244700]	Validation Loss: 340.9318409659787	Time: 0:00:00.096666
Epoch: [2/100]	Samples: [22830/1141500]	Train Loss: 360.61556879311763	Time: 0:00:00.795674
Epoch: [2/100]	Samples: [2447/244700]	Validation Loss: 335.76189524561323	Time: 0:00:00.052756
Epoch: [3/100]	Samples: [34245/1141500]	Train Loss: 352.9748807421704	Time: 0:00:00.820251
Epoch: [3/100]	Samples: [2447/244700]	Validation Loss: 330.5557247235135	Time: 0:00:00.097542
Epoch: [4/100]	Samples: [45660/1141500]	Train Loss: 348.878914805081	Time: 0:00:00.561746
Epoch: [4/100]	Samples: [2447/244700]	Validation Loss: 329.59659069140787	Time: 0:00:00.070083
Epoch: [5/100]	Samples: [57075/1141500]	Train Loss: 345.8888331382775	Time: 0:00:00.872701
Epoch: [5/100]	Samples: [2447/244700]	Validation Loss: 326.6838496636506	Time: 0:00:00.053087
Epoch: [6/100]	Samples: [68490/1141500]	Train Loss: 344.0899976559078	Tim

#### Notice that beta (the topic-word matrix) contains negative activations and has shape $K \times V$

In [4]:
# Show the shape and the raw values of the topic-word matrix (beta).
topic_word = model_output['topic-word-matrix']
print(topic_word.shape)
print(topic_word)

(20, 1612)
[[ 0.25676677  0.21063197  0.38287085 ...  0.2081572   0.19192426
   0.29204318]
 [-0.11582982 -0.1779684  -0.40677768 ... -0.46703288 -0.29751846
  -0.2726097 ]
 [-0.01092505  0.15014514  0.01552751 ... -0.35248208 -0.11071629
  -0.2600993 ]
 ...
 [-0.17565031 -0.3313362  -0.05640171 ...  0.0084874   0.10717141
  -0.13018116]
 [ 0.40472564  0.23212644  0.3644143  ...  0.2615479   0.1197729
   0.32827532]
 [-0.14924964 -0.13024591 -0.10577638 ... -0.22410616 -0.4211426
  -0.10390038]]


### Mining Step

In [5]:
# Transposed views of the model output: `beta` has one row per vocabulary
# word and one column per topic; `data` has one column per topic as well
# (checked by the assert below).
beta = model_output['topic-word-matrix'].T
data = model_output['topic-document-matrix'].T

# Sort every row of `data` in ascending order, then average position-wise
# over the rows: mat[-r] is the mean of the r-th largest activation.
mat = np.sort(data, axis=1).mean(axis=0)
assert len(mat) == K

# Find a suitable threshold kappa hyper-parameter (currently using 5th largest mean activation)
# NOTE(review): kappa is estimated from the topic-document matrix but is
# applied to the topic-word matrix below -- confirm that this is intended.
threshold = mat[-5]
print('threshold', threshold)

# Binarise beta in one vectorised comparison: each row (word) is a
# transaction and each column (topic) is an item that is present when its
# activation exceeds the threshold. A boolean frame is what mlxtend's
# apriori expects; a float 0/1 frame yields the same itemsets but triggers
# its non-bool deprecation warning.
reduced_data = pd.DataFrame(beta > threshold)

# Mine sets of up to 5 topics that co-occur in at least `min_s` of the rows.
min_s = 0.01
frequent_itemsets = apriori(reduced_data,
                            min_support=min_s,
                            max_len=5,
                            use_colnames=True,
                            verbose=1)

# Each itemset is a set of column labels (topic indices); keep them as lists.
topic_combinations = [list(a) for a in frequent_itemsets['itemsets']]
print("topics generated @ s =", min_s, ":", len(topic_combinations))

threshold 0.05809011665833185
Processing 380 combinations | Sampling itemset size 2Processing 1746 combinations | Sampling itemset size 3Processing 4344 combinations | Sampling itemset size 4Processing 6125 combinations | Sampling itemset size 5
topics generated @ s = 0.01 : 1304


### Get possible compositions ($\beta^{V \times K}$)

In [6]:
from algo.normal import calculate_compositions

# Build the composed topic-word rows for the mined combinations.
# NOTE(review): `topic_combinations` is both an input and an output here, so
# re-running this cell feeds its own result back in; re-run the mining cell
# first if this cell needs to be executed again.
output, topic_combinations = calculate_compositions(
    beta,
    topic_combinations,
    add_pairs=True,
)

### Get top-$l$ vocab ($l = 10$)

In [7]:
vocab = dataset.get_vocabulary()

# argpartition moves the 10 largest entries of every row into the last 10
# positions (in no guaranteed order among themselves); keep those indices.
top_word_ids = np.argpartition(np.array(output), -10)[:, -10:]

# Translate vocabulary indices into words: one list of 10 words per topic.
topics = [[vocab[word_id] for word_id in row] for row in top_word_ids]

### Calculate scores/fitness (estimation done using gensim)

In [8]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from gensim.models import CoherenceModel

# The OCTIS Coherence object is only used for the texts, dictionary and
# top-k setting it holds; the per-topic scores come from gensim directly.
coherence = Coherence(texts=dataset.get_corpus(), measure='c_npmi')

npmi_model = CoherenceModel(
    topics=topics,
    texts=coherence._texts,
    dictionary=coherence._dictionary,
    coherence='c_npmi',
    processes=4,
    topn=coherence.topk,
)

# One c_npmi score per entry of `topics`, in the same order.
total_scores = npmi_model.get_coherence_per_topic()

In [9]:
from algo.normal import count_unique_words

# Baseline: the first K entries are the model's original topics.
original_scores = total_scores[:K]
original_topics = topics[:K]

print('NPMI:', np.mean(original_scores))
# Topic uniqueness: unique words over the K*10 word slots.
print('TU:', count_unique_words(original_topics) / (K*10))

NPMI: 0.03922319988606042
TU: 0.88


### Optimization Step

In [10]:
from algo.gp_based import mdkp

# Solver settings forwarded to mdkp as keyword arguments.
solver_options = dict(MIP_gap=0.01, time_limit=3600)

# Select topics among all candidates.
# NOTE(review): 0.935*K*10 presumably is the required number of unique words
# (TU >= 0.935) and range(K) presumably seeds the search with the original
# topics -- confirm both against the mdkp signature.
choices = mdkp(topics, total_scores, K, 0.935*K*10, range(K), **solver_options)

Set parameter TokenServer to value "nas.nodes.preferred.ai"
Set parameter MIPGap to value 0.01
Set parameter TimeLimit to value 3600
Gurobi Optimizer version 9.5.1 build v9.5.1rc2 (linux64)
Thread count: 64 physical cores, 128 logical processors, using up to 32 threads
Optimize a model with 633 rows, 2014 columns and 16258 nonzeros
Model fingerprint: 0x7e577084
Variable types: 0 continuous, 2014 integer (2014 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e-04, 5e-01]
  Bounds range     [1e+00, 1e+00]
  RHS range        [2e+01, 2e+02]

User MIP start did not produce a new incumbent solution

Presolve removed 0 rows and 45 columns
Presolve time: 0.02s
Presolved: 633 rows, 1969 columns, 15763 nonzeros
Variable types: 0 continuous, 1969 integer (1939 binary)

Root relaxation: objective 2.265226e+00, 959 iterations, 0.01 seconds (0.02 work units)
Another try with MIP start

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Exp

In [11]:
# Same two metrics as for the baseline, restricted to the selected topics.
selected_scores = np.array(total_scores)[choices]
print('NPMI:', np.mean(selected_scores))

selected_topics = np.array(topics)[choices]
print('TU:', count_unique_words(selected_topics) / (K*10))

NPMI: 0.11020904937895741
TU: 0.96


#### Displaying composite topics $\hat{T}$

Each composite topic is shown with its component topics indented below it.

In [12]:
# Print every selected topic with its score. Indices below K are original
# topics; any other index is a composite, whose components (indices into
# `topics` / `total_scores`) are listed underneath it.
for idx, picked in enumerate(choices):
    if not picked:
        continue
    print("{0: .3f}".format(total_scores[idx]), " ".join(topics[idx]))
    if idx < K:
        print('\t> original component topic')
    else:
        for part in topic_combinations[idx]:
            print("\t>{0: .3f}".format(total_scores[part]), " ".join(topics[part]))

 0.197 project flight element orbit launch mission satellite space shuttle system
	> original component topic
 0.099 universe church scripture truth faith religion sin atheist belief atheism
	> original component topic
 0.202 internet key clipper phone security technology encryption secure communication privacy
	> original component topic
 0.190 keyboard serial apple mouse video board monitor card modem port
	> original component topic
 0.144 tire wheel brake cover buy bike car bag oil engine
	> original component topic
 0.264 star moon fly degree earth orbit mission planet solar surface
	> 0.126 ranger devil planet shot game blue fan red playoff cap
	> 0.197 project flight element orbit launch mission satellite space shuttle system
 0.289 baseball fan hockey play season win score team playoff game
	> 0.126 ranger devil planet shot game blue fan red playoff cap
	> 0.195 play team point game player score league good year season
 0.064 law constitution country innocent weapon civilian st

#### Original topics that are excluded

Notice that some "bad" topics are components of "new" composite topics

In [13]:
# List the original topics (indices 0..K-1) that were not selected.
for idx in range(K):
    if not choices[idx]:
        print("{0: .3f}".format(total_scores[idx]), " ".join(topics[idx]))

-0.325 portion reserve installation success prior estimate training extend frequently assure
 0.096 function string application manager app event font window widget block
 0.226 floppy controller ide hard disk crash boot problem scsi drive
 0.195 play team point game player score league good year season
 0.147 greek israeli russian armenian village jewish army people population turkish
 0.083 run version window binary make site include widget server file
-0.247 drink quickly reduce complaint virtually achieve hole assure cycle frequently
-0.244 extend remark thought hole assure delete face success unique chapter
-0.305 range lab unique frequently complaint estimate dollar portion assure mine
 0.071 drug increase fund president health medical year make disease patient
 0.134 ship offer interested email reply advance mailing sale mail address
 0.173 fire law amendment assault batf firearm weapon violent gun crime
 0.126 ranger devil planet shot game blue fan red playoff cap
 0.124 soldie