In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys
sys.path.append('/content/drive/MyDrive/activelearning')

In [1]:
from Topic_Models.topic_model import Topic_Model
import pandas as pd
from flask_app.classifier import Active_Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from Topic_Models.Neural_Topic_Model import Neural_Model
import pickle
from multiprocessing import Process, Manager
import copy
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
'''
The processed document path and the raw documents path
'''

doc_dir = './Topic_Models/Data/newsgroups/newsgroup_train.json'
processed_doc_dir = './Topic_Models/Data/newsgroups/newsgroup_train_processed.pkl'

'''
Mapping the mode numbers to which model we use
LA: active learning baseline
'''
model_types_map = {0: 'LA' , 1: 'LDA', 2: 'SLDA', 3: 'ETM', 4: 'CTM', 7: 'LLDA', 8: 'PLDA', 9: 'Bertopic'}

'''
num_iter: number of iterations to run when updating sLDA model
load_data: If use load_data, a trained topic model would be loaded.
'''
num_iter = 1000
load_data = True
save_model = False

'''
Enter the number of topics for the model you just trained
'''
num_topics =16
inference_alg = 'logreg'
test_dataset_name = './Topic_Models/Data/newsgroups/newsgroup_test.json'

'''
Keep those and don't change
'''
USE_TEST_DATA = True
USE_PROCESSED_TEXT = False
CONCATENATE_KEYWORDS = True
table = pd.read_json(doc_dir)
training_length = len(table)
REGRESSOR_PREDICT = True
mode = 2

In [3]:
model = Topic_Model(num_topics, 0, model_types_map[mode], processed_doc_dir, training_length, {}, True, './Topic_Models/Model/{}_{}.pkl'.format(model_types_map[mode], num_topics))
# model = Neural_Model('./Model/ETM_{}.pkl'.format(num_topics), processed_doc_dir, doc_dir)

In [6]:
'''
find_doc_for_TA: Given a dictionary, where the keys are topic numbers
the values are list of tuples of (document_id, probability), first find
the topic that has the maximum median probability, then finds the document
within that topic that has the maximum probaiblity
'''
from flask_app.utils.tools import find_doc_for_TA
import numpy as np

In [7]:
'''
The document topic distribution for the topic model
'''
document_probas = model.document_probas

In [8]:
'''
The result is a N X K matrix. N is the number of
documents, K is the number of topics
'''
np.shape(model.doc_topic_probas)

(10347, 16)

In [9]:
def count_docs(dictionary):
    result = 0
    for k, v in dictionary.items():
        result += len(v)

    return result

def update_median_prob(topic_num, idx_in_topic, doc_probs):
    try:
        doc_probs[topic_num].pop(idx_in_topic)
    except:
          doc_probs.pop(topic_num)

def print_topic_lens(dictionary):
    for k, v in dictionary.items():
        print('topic {} num docs {}'.format(k, len(v)))

In [10]:
%%capture
print('total num docs', count_docs(document_probas))

recommended_docs = []
recommended_topics = []
# Since the length of the document probas change, np.ones length should also change
for i in range(100):
    chosen_idx, chosen_topic, chosen_idx_in_topic = find_doc_for_TA(document_probas, np.ones(training_length))
    update_median_prob(chosen_topic, chosen_idx_in_topic, document_probas)
    # print_topic_lens(document_probas)
    print('doc id is ', chosen_idx)
    # chosen_topic
    recommended_topics.append(chosen_topic)
    recommended_docs.append(chosen_idx)

In [11]:
'''
Run a hundred iterations to pick the document that has the maximum median probability in the topic.
Check the distribution of topics being picked. This is a good way to check topic diversity first
'''

from collections import Counter
Counter(recommended_topics)

Counter({3: 23, 14: 26, 2: 51})

In [25]:
import numpy as np
def save_results(lst, file_name):
    np.save(file_name, lst)

In [26]:
'''
Active learning baseline to test the metrics of clf without topic modeling
'''
def calculate_activelearning_diversity(process=False, unigram=False):
    accs, purity, ri, nmi = [], [], [], []
    test_accs, test_purity, test_ri, test_nmi = [], [], [], []
    test_df = pd.read_json(test_dataset_name)
    # print('test length is', len(test_df))

    '''
    Read the documents and the labels
    '''
    df = pd.read_json(doc_dir)
    labels = df.sub_labels.values.tolist()

    raw_texts = df.text.values.tolist()[0:training_length]
    test_texts = test_df.text.values.tolist()
    raw_texts.extend(test_texts)


    '''
    Tried to test the performance of unigram and bigrams, the performance of bigram vectorizer is better
    '''
    if unigram == True:
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 1))
    else:
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 2))
    '''
    If process is True, load the processed data and use processed data as inputs for the clf
    else, just use the raw texts to as input for the clf. Results indicate that using raw texts
    has higher performance than using processed data
    '''
    if not process:
        vectorizer_idf = vectorizer.fit_transform(raw_texts)
    else:
        with open(processed_doc_dir, 'rb') as inp:
            loaded_train_data = pickle.load(inp)
            processed_train_data = loaded_train_data['datawords_nonstop']


        text_data = [' '.join(doc) for doc in processed_train_data]
        # from sklearn.feature_extraction.text import CountVectorizer
        # vectorizer = CountVectorizer(binary=True)
        vectorizer_idf = vectorizer.fit_transform(text_data)


    '''
    Initialize the active learning classifier
    '''
    alto = Active_Learning(raw_texts, None,  None, df, inference_alg, vectorizer_idf, training_length, 0, test_df)

    recommended_docs1 = []
    recommended_topics1 = []
    scores = []

    '''
    Label 300 hundred documents and save the three metrics
    '''
    for i in range(300):
        '''
        Get the recommended document id and the score (entropy) for the current document
        '''
        recommend_id, score = alto.recommend_document(True)

        alto.label(recommend_id, labels[recommend_id])
        recommended_docs1.append(recommend_id)
        recommended_topics1.append(alto.last_recommended_topic)
        scores.append(score)
        # print(raw_texts[recommend_id])

        '''
        clf only works for at least 2 distinct classes
        '''
        if len(alto.classes) >= 3:
            train_acc, b, p, r, n, e, f, g = alto.eval_classifier()
            accs.append(train_acc)
            purity.append(p)
            ri.append(r)
            nmi.append(n)
            test_accs.append(b)
            test_purity.append(e)
            test_ri.append(f)
            test_nmi.append(g)

    return accs, purity, ri, nmi, test_accs, test_purity, test_ri , test_nmi

In [33]:
'''
Evaluate the performance of activalearning clf with topic model features added to the clf
'''
def calculate_topic_diversity(module, concat, Model=None):

    accs, purity, ri, nmi = [], [], [], []
    test_accs, test_purity, test_ri, test_nmi = [], [], [], []
    test_df = pd.read_json(test_dataset_name)
    # print('test length is', len(test_df))

    if Model is None:
        if module == 3:
            model = Neural_Model('./Topic_Models/Model/ETM_{}.pkl'.format(num_topics), processed_doc_dir, doc_dir)
        elif module == 4:
            model = Neural_Model('./Topic_Models/Model/CTM_{}.pkl'.format(num_topics), processed_doc_dir, doc_dir)
        elif module == 9:
            model = Neural_Model('./Topic_Models/Model/Bertopic_{}.pkl'.format(num_topics), processed_doc_dir, doc_dir)
        else:
            model = Topic_Model(num_topics, 0, model_types_map[module], processed_doc_dir, training_length, {}, True, './Topic_Models/Model/{}_{}.pkl'.format(model_types_map[module], num_topics))
    else:
        '''
        If no module is selected, then use the Model passed in in the parameter
        '''
        model = Model

    '''
    Read texts, construct a text encoding for the clf
    '''
    df = pd.read_json(doc_dir)
    labels = df.sub_labels.values.tolist()
    raw_texts = df.text.values.tolist()[0:training_length]
    document_probas, doc_topic_probas = model.group_docs_to_topics()
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1,2))
    test_texts = test_df.text.values.tolist()

    '''
    If concatenation is true, concatenate the topic distribution with the text features
    '''
    if concat == True:

        vectorizer_idf = vectorizer.fit_transform(raw_texts)

        '''
        concatenate the features from topic model with the clf
        '''
        concatenated_features = model.concatenate_features(vectorizer_idf)

        vectorizer_idf = concatenated_features
    else:
        model_concate_texts = raw_texts
        vectorizer_idf = vectorizer.fit_transform(model_concate_texts)



    alto = Active_Learning(raw_texts, copy.deepcopy(document_probas), doc_topic_probas, df, inference_alg, vectorizer_idf, training_length, 1, test_df)

    print('start synthetic labeling')
    recommended_docs1 = []
    recommended_topics1 = []
    scores = []

    '''
    Label 300 documents
    '''
    for i in range(300):
        recommend_id, score = alto.recommend_document(True)



        alto.label(recommend_id, labels[recommend_id])
        recommended_docs1.append(recommend_id)
        recommended_topics1.append(alto.last_recommended_topic)
        scores.append(score)
        # print(raw_texts[recommend_id])
        # print(alto.user_labels)
        if len(alto.classes) >= 3:
            train_acc, b, p, r, n, e, f, g = alto.eval_classifier()
            accs.append(train_acc)
            purity.append(p)
            ri.append(r)
            nmi.append(n)
            test_accs.append(b)
            test_purity.append(e)
            test_ri.append(f)
            test_nmi.append(g)

        '''
        Update and retrain supervised LDA based on available labels. set i%N == 0, where N
        is the number of documents you labled before you want to update sLDA
        '''
        if module == 2 and i != 0 and i % 500 == 0:
            model = Topic_Model(num_topics, num_iter, model_types_map[module], processed_doc_dir, training_length, alto.user_labels, False, None)
            model.train('./Topic_Models//Model/SLDA_test.pkl')
            model = Topic_Model(num_topics, 0, model_types_map[module], processed_doc_dir, training_length, {}, True, './Topic_Models//Model/SLDA_test.pkl')
            document_probas, doc_topic_probas = model.group_docs_to_topics()
            alto.update_doc_probs(copy.deepcopy(document_probas), doc_topic_probas)
            vectorizer_idf = vectorizer.fit_transform(raw_texts)
            '''
            concatenate the features from topic model with the classifier
            '''
            concatenated_features = model.concatenate_features(vectorizer_idf)



            vectorizer_idf = concatenated_features
            alto.update_text_vectorizer(vectorizer_idf)


    return recommended_topics1, accs, purity, ri, nmi, test_accs, test_purity, test_ri , test_nmi

In [None]:
def run_experiment_and_save(module, save_path):
  '''
  module: model_types_map = {0: 'LA' , 1: 'LDA', 2: 'SLDA', 3: 'ETM', 4: 'CTM', 7: 'LLDA', 8: 'PLDA', 9: 'Bertopic'}
  Enter the module number to run the the model
  '''

  if module == 0:
    a0, p0, r0, n0, ta0, tp0, tr0, tn0 = calculate_activelearning_diversity(False, False)
    result_lst = [a0, p0, r0, n0, ta0, tp0, tr0, tn0]
    save_results(result_lst, save_path)
  else:
    recommended_topics1, a1, p1, r1, n1, ta1, tp1, tr1, tn1 = calculate_topic_diversity(module, True)
    result_lst = [a1, p1, r1, n1, ta1, tp1, tr1, tn1]
    print('purity list length is ', len(p1))
    save_results(result_lst, save_path)


In [None]:
%%capture
'''
Run the LDA experiment. The saved results can be plotted in the new_mode_plot.ipynb notebook
'''
run_experiment_and_save(1, './model_testing_results/16_topics_LDA.npy')

In [None]:
%%capture
'''
Run the sLDA experiment
'''
run_experiment_and_save(2, './model_testing_results/16_topics_SLDA.npy')

In [None]:
%%capture
'''
Run the ETM experiment
'''
run_experiment_and_save(3, './model_testing_results/16_topics_ETM.npy')

In [None]:
%%capture
'''
Run the CTM experiment
'''
run_experiment_and_save(4, './model_testing_results/16_topics_CTM.npy')

[]
I've got a probelm with printing envelops on my deskjet 550C from Word
for Windows.

History: when I had a Deskjet 500, I had to modify the macro so that
it would disable reverse printing since there seemed to be a bug that
interferred with printing envelops.  That bug is still present, however...

Now: the problem I have is that my DeskJet 550 print driver (came with
the printer I bought in December) still wants to print the envelops in
PORTRAIT mode.  However, the DeskJet 550 feeds envelops the from the
narrow end (i.e. landscape mode).  How do I get the printer to print
the envelops in the correct orientation?

PLEASE PLEASE PLEASE do not post this to the net since:
    (a) due to end of the semester cruch, I can't keep up on these groups,
and (b) I will summarize the best answer(s) to the net anyhow!
['os.ms-windows.misc']
# ## # Gay men constitute at least 20% of all child molestations.  Whether
# ## # this is because gay molesters are unusually common, or have unusually
# ## #

In [None]:
%%capture
'''
Run the PLDA experiment
'''
run_experiment_and_save(8, './model_testing_results/16_topics_PLDA.npy')

In [None]:
%%capture
'''
Run the Bertopic experiment
'''
run_experiment_and_save(9, './model_testing_results/16_topics_BERTOPIC.npy')

### Fetch word priors from the topic model. Possibly can be used as features for the classifier

In [35]:
from topic_model import Topic_Model
full_len = len(pd.read_json('./Data/congressional_bill_train.json'))
Model = Topic_Model(13, 2500, 'LDA', './Data/congressional_bill_train_processed.pkl', full_len , {}, False, None)
save_path = './Model/{}_{}.pkl'.format('LDA', 13)
Model.train(save_path)

ModuleNotFoundError: ignored

In [None]:
len(Model.model.used_vocabs)

9306

In [None]:
Model.model.used_vocabs



In [None]:
for word in Model.model.used_vocabs:
    prior = Model.model.get_word_prior(word)
    if len(prior) > 0:
        print(word)

In [None]:
Model.model.tw

0

In [None]:
model_vocabs = set()
for i in Model.data_words_nonstop:
    for j in i:
        model_vocabs.add(j)

len(model_vocabs)

9306

In [None]:
from gensim import corpora, models

# Example corpus of documents (list of lists of tokens)
with open('./Data/congressional_bill_train_processed.pkl', 'rb') as inp:
    data = pickle.load(inp)
    corpus = data['datawords_nonstop']

# Create a dictionary from the corpus
dictionary = corpora.Dictionary(corpus)

# Create a bag-of-words representation of the corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

# Train an LDA model on the bag-of-words corpus
lda_model = models.LdaModel(bow_corpus, num_topics=13, id2word=dictionary, passes=10)



Word Prior Probability for 'college': 0.0000


In [None]:
def get_prior(target_word):
    # The word for which you want to get the word prior probability
    # target_word = "veteran"

    # Get the word prior probability for the target word
    word_id = dictionary.token2id[target_word]
    word_prior_prob = sum(prob for _, prob in lda_model.get_term_topics(word_id))

    print(f"Word Prior Probability for '{target_word}': {word_prior_prob:.4f}")

In [None]:
for word in corpus[0]:
    get_prior(word)

Word Prior Probability for 'authorize': 0.0886
Word Prior Probability for 'study': 0.0000
Word Prior Probability for 'feasibility': 0.0000
Word Prior Probability for 'desirability': 0.0000
Word Prior Probability for 'establish': 0.0602
Word Prior Probability for 'national': 0.0994
Word Prior Probability for 'recreation': 0.0000
Word Prior Probability for 'area': 0.0113
Word Prior Probability for 'know': 0.0000
Word Prior Probability for 'santa': 0.0000
Word Prior Probability for 'margarita': 0.0000
Word Prior Probability for 'national': 0.0994
Word Prior Probability for 'recreation': 0.0000
Word Prior Probability for 'area': 0.0113
Word Prior Probability for 'area': 0.0113
Word Prior Probability for 'san': 0.0000
Word Prior Probability for 'diego': 0.0000
Word Prior Probability for 'county': 0.0145
Word Prior Probability for 'calif': 0.0000
Word Prior Probability for 'presently': 0.0000
Word Prior Probability for 'constitute': 0.0000
Word Prior Probability for 'camp': 0.0000
Word Prior