### Contextualized topic model quick example

In [9]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import numpy as np


def get_coherence(model_keywords):
    keywords = []
    for k, v in model_keywords.items():
            keywords.append(v)
    dictionary = Dictionary(processed_data)

    coherence_model = CoherenceModel(
            topics=keywords,
            texts=processed_data,
            dictionary=dictionary,
            coherence='u_mass'
            )

    coherence_score = coherence_model.get_coherence()

    return coherence_score

In [11]:
import pandas as pd
import pickle
from Topic_Models.Neural_Topic_Model import Neural_Model
doc_dir = './Topic_Models/Data/congressional_bill_train.json'
processed_doc_dir = './Topic_Models/Data/congressional_bill_train_processed.pkl'

'''
Mapping the mode numbers to which model we use
LA: active learning baseline
'''
model_types_map = {0: 'LA' , 1: 'LDA', 2: 'SLDA', 3: 'ETM'}
num_iter = 1500
load_data = True
save_model = False

'''
Enter the number of topics for the model you just trained
'''
num_topics =30
inference_alg = 'logreg'
test_dataset_name = './Topic_Models/Data/congressional_bill_test.json'
test_processed_doc_dir = './Topic_Models/Data/congressional_bill_test_processed.pkl'

'''
Keep those and don't change
'''
USE_TEST_DATA = True
USE_PROCESSED_TEXT = False
CONCATENATE_KEYWORDS = True
table = pd.read_json(doc_dir)
training_length = len(table)
REGRESSOR_PREDICT = True
mode = 1
labels = table.sub_labels.values.tolist()


table = pd.read_json(doc_dir)
training_length = len(table)
list_of_unpreprocessed_documents = table.text.values.tolist()

with open(processed_doc_dir, 'rb') as inp:
    loaded_data = pickle.load(inp)
    processed_data = loaded_data['datawords_nonstop']
    list_of_preprocessed_documents = [' '.join(ele) for ele in processed_data]

test_table = pd.read_json(test_dataset_name)
list_of_unpreprocessed_test_documents = test_table.text.values.tolist()
with open(test_processed_doc_dir, 'rb') as inp:
    loaded_test_data = pickle.load(inp)
    processed_test_data = loaded_test_data['datawords_nonstop']
    list_of_preprocessed_test_documents = [' '.join(ele) for ele in processed_test_data]

### sLDA prediction test with preprocessed words

In [12]:
'''
Given a training documents, and a testing documents, return the model prediciton by slda
'''
def slda_predict(train_docs, test_docs, model):
    '''
    train docs, test docs: a corpus. Each document is a list of tokens
    model: a trained slda model
    '''
    counter = {}
    predictive_labels = []
    test_predictive_labels = []
    for i in range(len(train_docs)):
        doc_nist = model.make_doc(train_docs[i])
        
        model.infer(doc_nist)
        max_response = model.estimate(doc_nist).argmax()
        predictive_labels.append(max_response)
        try:
            test_inst = model.make_doc(test_docs[i])
            model.infer(test_inst)
            max_test_response = model.estimate(test_inst).argmax()
            test_predictive_labels.append(max_test_response)
        except: 
            pass
        # test_inst = Model.model.make_doc(Model.data_words_nonstop[i])
        if max_response in counter:
            counter[max_response] += 1
        else:
            counter[max_response] = 1

    return counter, predictive_labels, test_predictive_labels

In [13]:
'''
Given a list of text labels, convert them to integers as inputs of response variables for model testing
'''
def convert_labels_to_numbers(label_list):
    label_mapping = {}
    next_label_number = 0
    numeric_labels = []

    for label in label_list:
        if label not in label_mapping:
            label_mapping[label] = next_label_number
            next_label_number += 1
        numeric_labels.append(label_mapping[label])

    return numeric_labels, label_mapping

In [14]:
'''
Convert the training set labels to integers and get a map from text to integers
'''
texts = table.text.values.tolist()
sub_labels = table.sub_labels.values.tolist()

'''
Here, we pretend the user labels all the training set, we want to run sLDA
first with the existing user labels, then test the prediction accuracy directly
from sLDA with the training and testing set
'''
user_labels = {i: sub_labels[i] for i in range(len(texts))}

user_labels = dict(sorted(user_labels.items()))
user_label_list = list(user_labels.values())

user_label_map, mappings = convert_labels_to_numbers(user_label_list)
user_label_map[:10]

[0, 1, 2, 3, 0, 4, 5, 6, 7, 8]

In [15]:
len(user_labels)

8607

In [16]:
'''
Also extract the preprocessed test data and use the label-integer mapping to map test texst labels to integers
'''
# test_table = pd.read_json(test_dataset_name)
# test_texts = test_table.text.values.tolist()
# test_sub_labels = test_table.sub_labels.values.tolist()

# with open('./Topic_Models/Data/congressional_bill_train_processed.pkl', 'rb') as inp:
#     loaded_data = pickle.load(inp)
#     processed_test_data = loaded_data['datawords_nonstop']

# test_label_map = [mappings[ele] for ele in test_sub_labels]


'\nAlso extract the preprocessed test data and use the label-integer mapping to map test texst labels to integers\n'

In [10]:
'''View the first ten mapped test labels'''
test_label_map[30:40]

[0, 15, 14, 7, 0, 4, 0, 2, 15, 2]

In [17]:
'''
Train the sLDA using the training set, assuming sLDA has access to all labels in the training set
'''
from Topic_Models.topic_model import Topic_Model
Model = Topic_Model(30, 1500, 'SLDA', './Topic_Models/Data/congressional_bill_train_processed.pkl', training_length , user_labels, False, None)
Model.train('./Topic_Models/Model/SLDA_30.pkl')

num topics: 30
Created SLDA model
total user labels are 8607
starting training...
Iteration: 0, Log-likelihood: -8.756933018416754, Perplexity: 6354.592243192826, cv coherence: 0.33639948398298614
Iteration: 300, Log-likelihood: -8.425364725686173, Perplexity: 4561.308101278413, cv coherence: 0.4180431772532487
Iteration: 600, Log-likelihood: -8.44556107815405, Perplexity: 4654.366443819756, cv coherence: 0.42368575203947256
Iteration: 900, Log-likelihood: -8.307986744323031, Perplexity: 4056.138713726887, cv coherence: 0.4218146943974183


In [27]:
counter, predictive_label, test_predictive_label = slda_predict(processed_data, processed_test_data, Model.model)

In [30]:
from sklearn.metrics import accuracy_score

'''
Training accuracy for sLDA
'''
accuracy_score(user_label_map, predictive_label)

0.5683773074321059

In [31]:
'''
sLDA testing accuracy
'''
accuracy_score(test_label_map, test_predictive_label)

0.4348134487161299

In [16]:
'''
Check whether regressor coefficient is good
'''
Model.model.get_regression_coef()

array([[-7.1717715e+00, -9.4333820e+00, -5.7512579e+00, -3.3249002e+00,
        -4.0262671e+00, -5.5705385e+00, -3.9009543e+02,  2.2824147e+00,
        -1.5841526e+00, -6.6370311e+00, -8.2382861e+03, -1.0060060e+02,
        -4.3441943e+02, -3.1912555e+02, -1.5520414e+00, -2.9579976e+00],
       [-8.4064159e+00, -2.4118944e+02, -6.0421720e+00, -3.7932055e+00,
        -4.1082692e+00, -2.1879861e+02, -6.1473203e+00, -6.4143572e+00,
        -1.9729582e+00,  7.3525006e-01, -3.7234268e-01, -5.3845730e+00,
        -3.5734338e+02, -2.8559399e+00, -7.1398320e+00, -1.5399799e+00],
       [-5.8458986e+00, -4.7016916e+02, -7.7924094e+00, -3.0099306e+00,
        -5.2282624e+00, -5.6554224e+02,  3.1108414e+01, -3.9984808e+02,
        -1.2367842e+00, -1.3726552e+01, -4.4619560e+00, -5.4000775e+02,
        -3.6306488e+01, -3.6162691e+00, -5.0478735e+00, -2.1048298e+00],
       [-7.1053696e+00,  2.1463418e+00,  4.7284150e-01, -3.2197487e+00,
        -1.3858293e+02, -1.1481842e+02, -5.1952405e+00, -7.66

In [17]:
'''
Check whether there are some difference in the regression coefficient sum
'''
for i in range(len(Model.model.get_regression_coef())):
    print(i, sum(Model.model.get_regression_coef()[i]))

0 -9528.254070520401
1 -870.773551851511
2 -2032.8360702991486
3 -3164.0942156910896
4 -255.13821169734
5 -2520.43114900589
6 -3246.4967193603516
7 -11112.864233613014
8 -1313.1837648749352
9 -1095.5500347316265
10 -10517.410782694817
11 -4510.059874117374
12 -6590.766949862242
13 -1636.322233557701
14 -860.9017847403884
15 -628.0747654438019
16 -9393.248667001724
17 -10175.824758648872
18 -9985.523555397987
19 -2281.8317324221134


In [13]:
Model.model.get_regression_coef(var_id=3)

array([-5.96283245e+00, -4.84228516e+00, -4.65203619e+00, -5.81537437e+00,
       -3.68868047e+04,  1.20726105e+02, -2.75120926e+00, -4.30867052e+00,
       -8.20122147e+00, -1.09877899e+02, -4.66463995e+00, -6.69718075e+00,
       -1.71611080e+01, -3.18311381e+00, -1.83578145e+04, -1.67979919e+02],
      dtype=float32)

In [18]:
Model.model.get_regression_coef()[2]

array([  -5.8458986, -470.16916  ,   -7.7924094,   -3.0099306,
         -5.2282624, -565.54224  ,   31.108414 , -399.84808  ,
         -1.2367842,  -13.726552 ,   -4.461956 , -540.00775  ,
        -36.306488 ,   -3.616269 ,   -5.0478735,   -2.1048298],
      dtype=float32)

In [19]:
Model.model.get_var_type(6)

'b'

In [20]:
Model.model.f

20

In [23]:
np.unique(list(user_labels.values()))

array(['atheism', 'autos', 'crypt', 'electronics', 'forsale', 'graphics',
       'med', 'motorcycles', 'os.ms-windows.misc', 'politics.guns',
       'politics.mideast', 'politics.misc', 'religion.christian',
       'religion.misc', 'space', 'sport.baseball', 'sport.hockey',
       'sys.ibm.pc.hardware', 'sys.mac.hardware', 'windows.x'],
      dtype='<U19')

### Test sLDA prediction accuracy on unprocessed dataset

In [24]:
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [25]:
np.unique(train.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [27]:
import tomotopy as tp
# mdl = tp.LDAModel(k=20)
mdl = tp.SLDAModel(k=16, vars=['b' for _ in range(20)], tw = tp.TermWeight.ONE, nu_sq= float('Inf'))

for line,label in zip(test.data, test.target):
    tgt = [0 for i in range(20)]
    tgt[label] = 1
    # mdl.add_doc(line.strip().split())
    mdl.add_doc(line.strip().split(), y=tgt)

for i in range(0, 1000, 10):
    mdl.train(10)
    if i % 500 == 0:
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# for k in range(mdl.k):
#     print('Top 10 words of topic #{}'.format(k))
#     print(mdl.get_topic_words(k, top_n=10))

mdl.summary()

Iteration: 0	Log-likelihood: -10.1851959941106
Iteration: 500	Log-likelihood: -8.990684810387462
<Basic Info>
| SLDAModel (current version: 0.12.4)
| 7532 docs, 2093344 words
| Total Vocabs: 201144, Used Vocabs: 201144
| Entropy of words: 8.52757
| Entropy of term-weighted words: 8.52757
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -8.97208
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 16 (the number of topics between 1 ~ 32767)
| vars: binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary, binary
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length

In [28]:
'''
Let sLDA predict on unprecessed dataset
'''
raw_counter, raw_predictive_label, raw_test_predictive_label = slda_predict([ele.strip().split() for ele in train.data], [ele.strip().split() for ele in test.data], mdl)

In [30]:
'''
sLDA training accuracy
'''
accuracy_score(train.target, raw_predictive_label)

0.4353013964999116

In [31]:
'''
sLDA testing accuracy
'''
accuracy_score(test.target, raw_test_predictive_label)

0.5667817312798725

### Bertopic Model 

In [15]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

topic_model = BERTopic(nr_topics=20, top_n_words = 30, calculate_probabilities=True, verbose=True, embedding_model='all-distilroberta-v1')


  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [17]:
y_labels = [-1 for i in range(len(list_of_preprocessed_documents))]  

In [18]:
topics, probs = topic_model.fit_transform(list_of_preprocessed_documents, y=y_labels)

Batches: 100%|██████████| 324/324 [00:08<00:00, 38.46it/s] 
2023-08-08 14:38:47,727 - BERTopic - Transformed documents to Embeddings
2023-08-08 14:39:24,874 - BERTopic - Reduced dimensionality
2023-08-08 14:39:29,339 - BERTopic - Clustered reduced embeddings
2023-08-08 14:39:31,152 - BERTopic - Reduced number of topics from 99 to 20


In [19]:
probs[909]

array([0.02833728, 0.37075882, 0.01281318, 0.00866021, 0.00810526,
       0.05104246, 0.01669256, 0.03562784, 0.03344985, 0.00510101,
       0.02656709, 0.01641701, 0.05026322, 0.04662573, 0.02544039,
       0.03672484, 0.00223811, 0.00188472, 0.00135317])

In [20]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4213,-1_say_come_go_want,"[say, come, go, want, thing, work, problem, te...",[entire point exactly claim hear eye witness d...
1,0,1077,0_gun_patient_disease_year,"[gun, patient, disease, year, doctor, cause, f...",[believe way thread progress exchange word top...
2,1,711,1_drive_card_disk_monitor,"[drive, card, disk, monitor, driver, problem, ...",[hello folk xl internal tape drive pretty comp...
3,2,666,2_car_bike_ride_engine,"[car, bike, ride, engine, mile, buy, look, dri...",[motorcycle enthusiast motorcycle bluntly suck...
4,3,639,3_game_team_play_player,"[game, team, play, player, year, win, season, ...",[incident agree cross check illegal kind check...
5,4,574,4_believe_faith_belief_sin,"[believe, faith, belief, sin, man, say, mean, ...",[ assuredly grain wheat fall ground die remain...
6,5,537,5_key_encryption_chip_government,"[key, encryption, chip, government, phone, esc...",[note file available anonymous file transfer d...
7,6,372,6_launch_space_orbit_satellite,"[launch, space, orbit, satellite, mission, roc...",[archive space intro modify date series link...
8,7,306,7_mail_send_list_address,"[mail, send, list, address, request, post, tha...",[person run mail list mail detail mailing list...
9,8,205,8_point_line_polygon_file,"[point, line, polygon, file, colormap, plane, ...",[bad question ref list think bit hard point fi...


In [21]:
topic_distr, _ = topic_model.approximate_distribution(list_of_preprocessed_documents)

100%|██████████| 11/11 [00:07<00:00,  1.50it/s]


In [22]:
topic_distr.shape

(10347, 19)

In [23]:
topic_distr[1960]

array([0.        , 0.43592104, 0.01193062, 0.        , 0.        ,
       0.00572662, 0.        , 0.04349114, 0.        , 0.        ,
       0.4544945 , 0.0134909 , 0.03494518, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [24]:
topic_model.get_topic(0, full = True)

{'Main': [('gun', 0.018608386835689935),
  ('patient', 0.01743275333799233),
  ('disease', 0.01290044194610203),
  ('year', 0.012726293250272806),
  ('doctor', 0.012682641452660296),
  ('cause', 0.01251456182586965),
  ('food', 0.012296402645355276),
  ('case', 0.011933622537441721),
  ('pain', 0.010590655261799727),
  ('government', 0.010284915441536353),
  ('say', 0.01026404033893175),
  ('study', 0.010171841044794775),
  ('day', 0.010021503092837898),
  ('drug', 0.00998470869034499),
  ('find', 0.009657294249058266),
  ('treatment', 0.009504535282748357),
  ('thing', 0.009314463948016004),
  ('right', 0.009213916793178973),
  ('health', 0.009210106562463802),
  ('medical', 0.009198581628137494),
  ('law', 0.008988594683375283),
  ('get', 0.008799588741669983),
  ('report', 0.008673746003080195),
  ('problem', 0.008484690462858066),
  ('increase', 0.00847513716292813),
  ('effect', 0.008428104256636842),
  ('state', 0.008397951380458847),
  ('child', 0.008340615348941642),
  ('weapon

In [17]:
list_of_preprocessed_documents[1]

'lotsa stuff take line process serve peaceful attempt serve warrant occur'

In [18]:
topic_model.get_document_info(list_of_preprocessed_documents[1])

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.643380,False
1,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.830507,False
2,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.293723,False
3,lotsa stuff take line process serve peaceful a...,1,1_game_team_play_player,"[game, team, play, player, season, year, win, ...",[team beat team night show night dominate thor...,game - team - play - player - season - year - ...,0.179136,False
4,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.270575,False
...,...,...,...,...,...,...,...,...
10641,lotsa stuff take line process serve peaceful a...,0,0_believe_say_gun_thing,"[believe, say, gun, thing, come, patient, find...",[actually conflict understand passage faith st...,believe - say - gun - thing - come - patient -...,0.505005,False
10642,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.386419,False
10643,lotsa stuff take line process serve peaceful a...,-1,-1_work_problem_want_thing,"[work, problem, want, thing, car, find, try, n...",[sixteen day test drive finally th rain fact c...,work - problem - want - thing - car - find - t...,0.664208,False
10644,lotsa stuff take line process serve peaceful a...,8,8_point_color_line_polygon,"[point, color, line, polygon, file, graphic, c...",[hi write program convert dxf file database fo...,point - color - line - polygon - file - graphi...,1.000000,False


In [19]:
similar_topics, similarity = topic_model.find_topics("money", top_n=20)
topic_model.get_topic(similar_topics[0])

[('delete', 0.20144164566745362),
 ('stuff', 0.08488888874596974),
 ('bobbevice', 0.07717434713296814),
 ('sink', 0.06448111832319289),
 ('sea', 0.06396901877052559),
 ('blow', 0.05352661557768844),
 ('stay', 0.04887674039410372),
 ('post', 0.04793612956060226),
 ('away', 0.04762020184234488),
 ('go', 0.04052875319395337),
 ('oh', 0.039213749229049646),
 ('sig', 0.03496994287783963),
 ('say', 0.03455796287898625),
 ('get', 0.03441626116735986),
 ('way', 0.033202785488326524),
 ('answer', 0.03175012306633117),
 ('shameless', 0.030344650893960396),
 ('guess', 0.029329205130954536),
 ('to', 0.029145421713633057),
 ('let', 0.028765852925775624),
 ('woofing', 0.028479543140855556),
 ('bring', 0.02828374428319642),
 ('find', 0.0281381185740009),
 ('ask', 0.02794590634256966),
 ('add', 0.026981985275913244),
 ('holiday', 0.026416635456545247),
 ('mean', 0.02639013804925574),
 ('hell', 0.026213026980475278),
 ('merchant', 0.026169701597624297),
 ('happy', 0.025861281297846882)]