In [13]:
from parser import OpenCorporaParser
from artm import ARTM
from tools import get_pointwise_mutual_information
import numpy as np
import scipy.sparse as sparse
from combined_smoothing_sparsing_regularizer import CombinedSmoothingSparsingRegularizer
from covariance_docs_regularizer import CovarianceDocsRegularizer
from covariance_topics_regularizer import CovarianceTopicsRegularizer

In [2]:
open_corpora_parser = OpenCorporaParser()

doc_term_matr, vocabulary, year, topic, close_word_pairs = open_corpora_parser.parse_open_corpora(
    path_to_corpus='annot.opcorpora.no_ambig.xml')

In [3]:
def get_words_list(vocabulary):
    return list(sorted(vocabulary, key=lambda word: vocabulary[word]))

In [4]:
word_in_doc_freqs = doc_term_matr.transpose()

In [5]:
perm = np.random.permutation(word_in_doc_freqs.shape[1])
word_in_doc_freqs_hold_out = sparse.dok_matrix(word_in_doc_freqs[:, perm[:int(len(perm)/10)]])
word_in_doc_freqs_train = sparse.dok_matrix(word_in_doc_freqs[:, perm[int(len(perm)/10):]])

In [14]:
words_count = word_in_doc_freqs.shape[0]
docs_count = word_in_doc_freqs_train.shape[1]

In [29]:
# baseline

topics_count = 100

plsa_model = ARTM(topics_count=topics_count, regularizers=[], regularizer_weights=[])

#TODO: plot convergence to show that EM is implemented correctly.

train_result = plsa_model.train(word_in_doc_freqs=word_in_doc_freqs_train,
                                words_list=get_words_list(vocabulary),
                                iterations_count=20, verbose=True)

iter#1: loglike=-232112.48143368118
iter#2: loglike=-230232.8404892832
iter#3: loglike=-227362.27818222516
iter#4: loglike=-222613.65348712401
iter#5: loglike=-215438.47972231676
iter#6: loglike=-206765.97311331515
iter#7: loglike=-198582.00317570867
iter#8: loglike=-192135.64972847133
iter#9: loglike=-187457.68543131973
iter#10: loglike=-184091.1059478081
iter#11: loglike=-181599.7854908576
iter#12: loglike=-179691.974017075
iter#13: loglike=-178188.9154804567
iter#14: loglike=-176971.79742480125
iter#15: loglike=-175970.21378539843
iter#16: loglike=-175137.35283748427
iter#17: loglike=-174431.24873664175
iter#18: loglike=-173827.72378696981
iter#19: loglike=-173310.64079053563
iter#20: loglike=-172861.14891889793


In [30]:
train_result.get_train_perplexity()

173.66173726035473

In [31]:
train_result.get_holdout_perplexity(holdout_word_in_doc_freqs=word_in_doc_freqs_hold_out, iterations_count=20, 
                                    verbose=True)

iter#1: loglike=-13577.979193660929
iter#2: loglike=-13133.387173448176
iter#3: loglike=-12964.22661162221
iter#4: loglike=-12886.070168487826
iter#5: loglike=-12844.230684617056
iter#6: loglike=-12819.189697585627
iter#7: loglike=-12802.950746503097
iter#8: loglike=-12791.78793750208
iter#9: loglike=-12783.725741634493
iter#10: loglike=-12777.667438810977
iter#11: loglike=-12772.963875395495
iter#12: loglike=-12769.216052119104
iter#13: loglike=-12766.183871820498
iter#14: loglike=-12763.726310228576
iter#15: loglike=-12761.74622424324
iter#16: loglike=-12760.155389606374
iter#17: loglike=-12758.870233945256
iter#18: loglike=-12757.820012154401
iter#19: loglike=-12756.950285298353
iter#20: loglike=-12756.220668710263


(292.81129219999019, 9421.1232441476968)

In [10]:
close_word_pairs_hold_out = np.array(close_word_pairs)[perm[:int(len(perm)/10)]]
close_word_pairs_train = np.array(close_word_pairs)[perm[int(len(perm)/10):]]

In [11]:
pmi = get_pointwise_mutual_information(word_in_doc_freqs_train, close_word_pairs_train)

train_result.get_pointwise_mutual_information_metric(pmi)

0.6197615636322461

In [12]:
pmi = get_pointwise_mutual_information(word_in_doc_freqs_hold_out, close_word_pairs_hold_out)

train_result.get_pointwise_mutual_information_metric(pmi)

0.47958504916211342

In [18]:
similarity_docs_matrix = (np.array(topic)[:, None] == np.array(topic)).astype(np.float)

In [25]:
regularizers = [CombinedSmoothingSparsingRegularizer(beta_0=0.5, alpha_0=0.5, 
                                                     beta=np.array([1e-4]*words_count), 
                                                     alpha=np.array([1e-4]*topics_count), 
                                                     num_topics=topics_count, 
                                                     num_words=words_count, 
                                                     num_docs=docs_count, 
                                                     domain_specific_topics=np.arange(80), 
                                                     background_topics=np.arange(80, 100)), 
                CovarianceTopicsRegularizer(tau=1.0, num_topics=topics_count, 
                                            num_words=words_count, num_docs=docs_count), 
                CovarianceDocsRegularizer(tau=1.0, num_topics=topics_count, num_words=words_count, 
                                          num_docs=docs_count, 
                                          similarity_docs_matrix=similarity_docs_matrix)
               ]

regularizer_weights = [1e-1, 1e-1, 1e-1]

In [32]:
regularized_model = ARTM(topics_count=topics_count, regularizers=regularizers, 
                         regularizer_weights=regularizer_weights)

#TODO: plot convergence to show that EM is implemented correctly.

train_result_reg = plsa_model.train(word_in_doc_freqs=word_in_doc_freqs_train,
                                words_list=get_words_list(vocabulary),
                                iterations_count=20, verbose=True)

iter#1: loglike=-232097.57402491051
iter#2: loglike=-230210.0123460084
iter#3: loglike=-227314.90060808152
iter#4: loglike=-222531.0125606625
iter#5: loglike=-215292.01246277124
iter#6: loglike=-206470.65803747554
iter#7: loglike=-198145.3977520377
iter#8: loglike=-191663.3660065342
iter#9: loglike=-187052.52492047072
iter#10: loglike=-183791.55683082246
iter#11: loglike=-181413.45129180324
iter#12: loglike=-179609.92609994358
iter#13: loglike=-178177.45528038283
iter#14: loglike=-176998.13499583583
iter#15: loglike=-176013.33252167297
iter#16: loglike=-175186.8882545553
iter#17: loglike=-174483.8637188816
iter#18: loglike=-173872.17297806047
iter#19: loglike=-173335.2059342725
iter#20: loglike=-172861.86549992667


In [27]:
train_result_reg.get_train_perplexity()

173.22899871268055

In [28]:
train_result_reg.get_holdout_perplexity(holdout_word_in_doc_freqs=word_in_doc_freqs_hold_out, iterations_count=20, 
                                    verbose=True)

iter#1: loglike=-13571.407282588487
iter#2: loglike=-13101.715733512685
iter#3: loglike=-12922.302287431212
iter#4: loglike=-12838.774707915933
iter#5: loglike=-12795.158369479012
iter#6: loglike=-12770.654297358804
iter#7: loglike=-12755.954379387282
iter#8: loglike=-12746.517112510033
iter#9: loglike=-12740.087927846322
iter#10: loglike=-12735.500001148379
iter#11: loglike=-12732.11155544102
iter#12: loglike=-12729.545671104857
iter#13: loglike=-12727.565083455342
iter#14: loglike=-12726.0112674667
iter#15: loglike=-12724.7740127897
iter#16: loglike=-12723.774874688528
iter#17: loglike=-12722.957158098669
iter#18: loglike=-12722.279419429737
iter#19: loglike=-12721.711087176976
iter#20: loglike=-12721.2294177277


(288.28482982420098, 8965.3214380604659)

In [None]:
pmi = get_pointwise_mutual_information(word_in_doc_freqs_train, close_word_pairs_train)

train_result_reg.get_pointwise_mutual_information_metric(pmi)

In [None]:
pmi = get_pointwise_mutual_information(word_in_doc_freqs_hold_out, close_word_pairs_hold_out)

train_result_reg.get_pointwise_mutual_information_metric(pmi)