In [1]:
%load_ext autoreload
%autoreload 2

### Additional Requirements

To run this entire notebook, you need to install OCTIS, which is used below to load the dataset and train the topic model:

~~~
pip install octis
~~~

In [2]:
import os

# Configure GPU visibility through environment variables, ahead of the
# remaining imports in this cell: enumerate devices by PCI bus id and expose
# only device 0 to this process.
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES='0',
)

import pickle
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori

#### Train the model and get topic-word, topic-document distribution

#### For this notebook we use OCTIS to train the model; you may substitute it with any NTM (neural topic model) of your choosing

In [3]:
from octis.dataset.dataset import Dataset
from octis.models.ProdLDA import ProdLDA

# Number of topics to learn; later cells also use K to tell the original
# topics (the first K) apart from the composite ones.
K = 20

# Fetch the "20NewsGroup" dataset through OCTIS.
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")
# Optional raw-text view of the corpus (not used in the cells shown):
# data_corpus = [' '.join(i) for i in dataset.get_corpus()]

# Train ProdLDA with K topics. `model_output` is indexed below with the keys
# 'topic-word-matrix' and 'topic-document-matrix'.
model = ProdLDA(num_topics=K)
model_output = model.train_model(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Epoch: [1/100]	Samples: [11415/1141500]	Train Loss: 382.95891251231933	Time: 0:00:01.207505
Epoch: [1/100]	Samples: [2447/244700]	Validation Loss: 338.8774008990601	Time: 0:00:00.063604
Epoch: [2/100]	Samples: [22830/1141500]	Train Loss: 360.0117293583005	Time: 0:00:00.672641
Epoch: [2/100]	Samples: [2447/244700]	Validation Loss: 336.9096943084006	Time: 0:00:00.062449
Epoch: [3/100]	Samples: [34245/1141500]	Train Loss: 355.4035380820877	Time: 0:00:00.656063
Epoch: [3/100]	Samples: [2447/244700]	Validation Loss: 329.9719924708508	Time: 0:00:00.062872
Epoch: [4/100]	Samples: [45660/1141500]	Train Loss: 350.0400070664969	Time: 0:00:00.655187
Epoch: [4/100]	Samples: [2447/244700]	Validation Loss: 328.23213453335717	Time: 0:00:00.062800
Epoch: [5/100]	Samples: [57075/1141500]	Train Loss: 347.6271623822821	Time: 0:00:00.656386
Epoch: [5/100]	Samples: [2447/244700]	Validation Loss: 423.54268140835717	Time: 0:00:00.062621
Epoch: [6/100]	Samples: [68490/1141500]	Train Loss: 343.9628962285781	Ti

Epoch: [45/100]	Samples: [513675/1141500]	Train Loss: 332.6589108440785	Time: 0:00:00.665572
Epoch: [45/100]	Samples: [2447/244700]	Validation Loss: 314.9151380357325	Time: 0:00:00.063798
Epoch: [46/100]	Samples: [525090/1141500]	Train Loss: 332.70700815128123	Time: 0:00:00.664554
Epoch: [46/100]	Samples: [2447/244700]	Validation Loss: 314.4691105738532	Time: 0:00:00.063970
Early stopping


#### Notice that beta (the topic-word matrix) contains negative activations and has shape K x V (topics x vocabulary)

In [4]:
# Show the shape of the topic-word matrix first, then the matrix itself,
# each on its own line.
print(
    model_output['topic-word-matrix'].shape,
    model_output['topic-word-matrix'],
    sep='\n',
)

(20, 1612)
[[-0.13062593 -0.08251245  0.00296721 ... -0.0295416  -0.34053457
  -0.03513475]
 [ 0.02646185 -0.17787448 -0.16627383 ... -0.21845856 -0.5283106
  -0.2165347 ]
 [ 0.03994723  0.18910125  0.04728971 ... -0.2283454  -0.3732265
  -0.30879128]
 ...
 [ 0.18177032  0.00865875  0.05434528 ...  0.06564841  0.10349061
   0.22130793]
 [-0.0826124  -0.18028025  0.05177277 ...  0.20207533 -0.06025597
  -0.0767033 ]
 [-0.22579312 -0.27452523 -0.24259743 ... -0.386449   -0.24701835
  -0.527656  ]]


### Mining Step

In [5]:
# Transpose the model outputs. The topic-word matrix printed above is K x V,
# so `beta` is V x K; `data` has one column per topic (checked by the assert
# below) and, presumably, one row per document.
beta = model_output['topic-word-matrix'].T
data = model_output['topic-document-matrix'].T

# Sort every row's activations in ascending order, then average each rank
# position over all rows: mat[-r] is the mean of the r-th largest activation.
mat = np.sort(data, axis=1).mean(axis=0)
assert len(mat) == K

# Find a suitable threshold kappa hyper-parameter (currently the mean of the
# 5th largest activation per row)
threshold = mat[-5]
print('threshold', threshold)

# Binarise the activations: a topic is "present" in a row when its activation
# exceeds the threshold. The comparison is vectorised instead of filling a
# zero matrix cell by cell, and the boolean DataFrame is the input type that
# mlxtend's apriori documents as preferred (non-bool frames are slower and
# trigger a deprecation warning in recent releases). The mined itemsets are
# the same as with a 0/1 float matrix.
reduced_data = pd.DataFrame(np.asarray(data) > threshold)

# Mine sets of up to 5 topics that are jointly present in at least `min_s`
# of the rows. With use_colnames=True the itemsets hold the column labels,
# which here are the integer topic indices.
min_s = 0.01
frequent_itemsets = apriori(reduced_data,
                            min_support=min_s,
                            max_len=5,
                            use_colnames=True,
                            verbose=1)

# Convert each frozenset of topic indices into a plain list.
topic_combinations = [list(a) for a in frequent_itemsets['itemsets']]
print("topics generated @ s =", min_s, ":", len(topic_combinations))

threshold 0.05591654907795865
Processing 8416 combinations | Sampling itemset size 4



Processing 2105 combinations | Sampling itemset size 5
topics generated @ s = 0.01 : 822


### Get possible compositions ($\beta^{V \times K}$)

In [6]:
from algo.normal import calculate_compositions

# Build the candidate topic-word rows from beta and the mined combinations.
# NOTE(review): `topic_combinations` is overwritten with the function's second
# return value, so re-running this cell without re-running the mining cell
# feeds the already-processed list back in - confirm that is harmless.
# Later cells treat the first K rows of `output` as the original topics and
# index `topic_combinations` by row to look up a composite's components.
output, topic_combinations = calculate_compositions(
    beta,
    topic_combinations,
    add_pairs=True,
)

### Get top-$l$ vocab ($l = 10$)

In [7]:
vocab = dataset.get_vocabulary()

# For every row of `output`, pick the column indices of its 10 largest
# entries (argpartition leaves those 10 in no particular order) and map each
# index to its vocabulary word.
topics = [
    [vocab[word_id] for word_id in row]
    for row in np.argpartition(np.array(output), -10)[:, -10:]
]

### Calculate scores/fitness (estimation done using gensim)

In [8]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from gensim.models import CoherenceModel

# The OCTIS Coherence object is used only as a holder: its prepared texts,
# dictionary and top-k setting are handed to a gensim CoherenceModel, which
# is then asked for one c_npmi score per topic.
coherence = Coherence(texts=dataset.get_corpus(), measure='c_npmi')
c = CoherenceModel(
    topics=topics,
    texts=coherence._texts,
    dictionary=coherence._dictionary,
    coherence='c_npmi',
    topn=coherence.topk,
    processes=4,
)

# One score per entry of `topics`, in the same order.
total_scores = c.get_coherence_per_topic()

  from scipy.linalg.special_matrices import triu


In [9]:
from algo.normal import count_unique_words

# for original
# Baseline: the first K entries are the original (non-composite) topics.
# NPMI is their mean coherence score.
print('NPMI:', np.asarray(total_scores[:K]).mean())
# TU: unique-word count divided by the K * 10 words listed in total
# (presumably "topic uniqueness" - confirm against count_unique_words).
print('TU:', count_unique_words(topics[:K]) / (10 * K))

NPMI: 0.07956555862542489
TU: 0.92


### Optimization Step

In [14]:
from algo.cvxpy_based import mdkp
import cvxpy as cp

# Select topics with the SCIP solver through CVXPY.
# NOTE(review): the 4th argument (0.935 * K * 10) looks like a lower bound on
# the number of unique words across the chosen topics, i.e. TU >= 0.935 -
# confirm against algo.cvxpy_based.mdkp.
# `choices` is later used as a per-topic selection indicator.
choices = mdkp(
    topics,
    total_scores,
    K,
    0.935*K*10,
    solver=cp.SCIP,
)

                                     CVXPY                                     
                                     v1.3.0                                    
(CVXPY) Oct 28 09:06:12 PM: Your problem has 1647 variables, 827 constraints, and 0 parameters.
(CVXPY) Oct 28 09:06:12 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 28 09:06:12 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 28 09:06:12 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 28 09:06:12 PM: Compiling problem (target solver=SCIP).
(CVXPY) Oct 28 09:06:12 PM: Reduction chain: FlipObjective -> Dcp2Cone -> CvxAttr2Constr -> 

In [16]:
# Same two metrics as the baseline, restricted to the topics picked by the
# optimisation step.
print('NPMI:', np.array(total_scores)[choices].mean())
print('TU:', count_unique_words(np.array(topics)[choices]) / (10 * K))

NPMI: 0.16129195596333842
TU: 0.97


#### Displaying composite topics $\hat{T}$

Each composite topic is shown with its component topics indented below it.

In [17]:
# Print every selected topic as "<score> <top words>". Indices below K are
# original topics; any other selected index is a composite, and each of its
# component topics is listed underneath, tab-indented.
for i, chosen in enumerate(choices):
    if not chosen:
        continue
    print("{0: .3f}".format(total_scores[i]), " ".join(topics[i]))
    if i < K:
        print('\t> original component topic')
        continue
    # The loop variable is deliberately not called `c`: at notebook scope that
    # would rebind the global `c` (the CoherenceModel) to an integer index.
    for component in topic_combinations[i]:
        print("\t>{0: .3f}".format(total_scores[component]), " ".join(topics[component]))

 0.229 government enforcement phone secure clipper key encryption encrypt secret chip
	> original component topic
 0.250 greek village massacre population muslim russian genocide turkish people armenian
	> original component topic
 0.234 bus card hard controller driver ide drive disk scsi problem
	> original component topic
 0.282 win league goal playoff season game play player score team
	> original component topic
 0.160 heat fuel temperature ground water power engine battery air mile
	> 0.195 power front oil ride brake car engine bike battery tire
	> 0.315 shuttle moon earth satellite solar planet mission orbit surface launch
 0.088 claim atheism absolute truth evidence word belief make point thing
	> 0.114 atheist argument point moral belief science prove atheism evidence theory
	> 0.146 eternal relationship church verse scripture man sin homosexual word teach
 0.167 tie red wing ranger ice star year team player series
	> 0.282 win league goal playoff season game play player score 

#### Original topics that are excluded

Notice that some "bad" topics are components of "new" composite topics

In [18]:
# List the original topics (indices 0..K-1) that the optimisation step did
# not select, each as "<score> <top words>".
for i in range(K):
    if not choices[i]:
        print("{0: .3f}".format(total_scores[i]), " ".join(topics[i]))

 0.195 power front oil ride brake car engine bike battery tire
 0.114 atheist argument point moral belief science prove atheism evidence theory
 0.315 shuttle moon earth satellite solar planet mission orbit surface launch
 0.073 people food good insurance year drug disease patient health medical
 0.146 eternal relationship church verse scripture man sin homosexual word teach
 0.134 condition offer ship apple port shipping sale monitor price video
-0.285 previously length huge requirement suit white powerful examine limited installation
 0.170 illegal gas police crime gun fire assault compound weapon batf
 0.043 floor sit start neighbor didn good time thing make people
 0.178 list network mail newsgroup address email internet mailing domain host
-0.268 delay installation status examine task fill possibly requirement critical presence
 0.166 button application screen manager font run problem window widget motif
-0.249 rule portion chapter explain freedom relation remark virtually languag