### Training a corpus-wide RMN with TF-IDF embeddings

In [1]:
import os
import sys
import pandas as pd

In [2]:
# Make the project's assembly and modeling scripts importable from this
# notebook. The membership check keeps the cell idempotent: re-running it no
# longer piles duplicate entries onto sys.path.
# NOTE(review): these are absolute, user-specific locations; the notebook will
# not run on another machine without editing them.
for _scripts_dir in (
    "/home/rocassius/w266_final/scripts/assembly",
    "/home/rocassius/w266_final/scripts/modeling",
):
    if _scripts_dir not in sys.path:
        sys.path.append(_scripts_dir)

In [3]:
from document import load_documents
from constant import DOC_PRAYER_PATH, MIN_SESSION, MAX_SESSION, DOC_ALL_PATH
from subject import subject_keywords

# Every session number from MIN_SESSION through MAX_SESSION, inclusive.
sessions = [*range(MIN_SESSION, MAX_SESSION + 1)]

In [4]:
from helper import load_pickled_object
from rmn import *
from rmn_data_generator import RMN_DataGenerator
from rmn_analyzer import RMN_Analyzer

In [5]:
# Load the embedding tools (metadata, tokenizers, embedding matrices).
prayer_tools_path = "/home/rocassius/gen-data/tools/prayer_tools"


def _load_prayer_tool(tool_name):
    """Load the object saved as `tool_name` under `prayer_tools_path`."""
    return load_pickled_object(os.path.join(prayer_tools_path, tool_name))


metadata_dict = _load_prayer_tool("metadata_dict")
tokenizer_dict = _load_prayer_tool("tokenizer_dict")
embedding_matrix = _load_prayer_tool("idf_embedding_matrix")
global_embedding_matrix = _load_prayer_tool("embedding_matrix_wg")
global_tokenizer_dict = _load_prayer_tool("tokenizer_dict_wg")

In [6]:
# Downcast the embedding matrix to half precision (presumably to save memory).
embedding_matrix = embedding_matrix.astype("float16")

In [7]:
# Load the documents for every session in `sessions` from DOC_PRAYER_PATH.
docs_df = load_documents(sessions, DOC_PRAYER_PATH)

In [8]:
# Display the (rows, columns) shape of the loaded frame as a size check.
docs_df.shape

(4057395, 10)

In [62]:
# Create the model object that will be trained on the corpus.
rmn = RigidRMN(dropout=0.5)

# Attach the embedding tools and the model dimensions before building.
for _attr, _value in (
    ("embedding_matrix", embedding_matrix),
    ("tokenizer_dict", tokenizer_dict),
    ("metadata_dict", metadata_dict),
    ("infer_embedding_matrix", global_embedding_matrix),
    ("infer_tokenizer_dict", global_tokenizer_dict),
    ("meta_embedding_dim", 25),
    ("num_topics", 50),
):
    setattr(rmn, _attr, _value)

# Hyperparameters handed to build_model, gathered in one place.
_build_kwargs = dict(
    gamma=0.1,
    theta=0.0,
    omega=0.01,
    lamb=0.001,
    bias_reconstruct=False,
)
rmn.build_model(**_build_kwargs)

In [None]:
#rmn.model.summary()

In [73]:
# Train on the full corpus: `data_df` starts out as another name for `docs_df`
# (no copy is made here).
data_df = docs_df

In [75]:
# Training schedule: every epoch is split into `n_folds` consecutive chunks of
# row positions, and the model is fit on one chunk at a time.
n_epochs = 2
n_folds = 10

# Base fold size, plus the number of rows left over after an even split.
fold_size, _n_leftover = divmod(data_df.shape[0], n_folds)

# The first `_n_leftover` folds take one extra row each so that every row is
# covered. Plain floor division silently dropped the trailing
# `len(data_df) % n_folds` rows (and produced only empty folds whenever the
# frame had fewer rows than folds).
_fold_bounds = [0]
for _fold_number in range(n_folds):
    _extra_row = 1 if _fold_number < _n_leftover else 0
    _fold_bounds.append(_fold_bounds[-1] + fold_size + _extra_row)

# Positional row indices for each fold, meant for use with `.iloc`.
index_folds = [
    list(range(start, stop))
    for start, stop in zip(_fold_bounds, _fold_bounds[1:])
]

In [76]:
for epoch in range(n_epochs):
    # Reshuffle the whole corpus so each epoch sees differently composed folds.
    data_df = data_df.sample(frac=1)

    for fold in index_folds:
        # Give each fold a fresh, unique 0..n-1 index. The traceback from the
        # interrupted run passes through pandas' `get_indexer_non_unique`,
        # which suggests batch lookups were hitting a non-unique index; such
        # label lookups are slow and can return more than one row per label.
        # NOTE(review): assumes RMN_DataGenerator does not rely on the original
        # index labels -- confirm against rmn_data_generator.py.
        fold_df = data_df.iloc[fold].reset_index(drop=True)
        fold_generator = RMN_DataGenerator(rmn=rmn, data_df=fold_df, batch_size=256)
        rmn.model.fit_generator(
            fold_generator,
            epochs=1,
            use_multiprocessing=True,
            workers=10,
        )

print("Success")

 318/1584 [=====>........................] - ETA: 42s - loss: 0.5309

Process Keras_worker_ForkPoolWorker-362:
Process Keras_worker_ForkPoolWorker-363:
Process Keras_worker_ForkPoolWorker-361:
Process Keras_worker_ForkPoolWorker-365:
Process Keras_worker_ForkPoolWorker-370:
Process Keras_worker_ForkPoolWorker-368:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process Keras_worker_ForkPoolWorker-367:
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()


KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/rocassius/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    r

  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/utils/data_utils.py", line 571, in get_index
    return _SHARED_SEQUENCES[uid][i]
  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/keras_preprocessing/text.py", line 310, in texts_to_sequences_generator
    self.split)
  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/keras_preprocessing/text.py", line 312, in texts_to_sequences_generator
    for w in seq:
  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/keras_preprocessing/text.py", line 279, in texts_to_sequences
    return list(self.texts_to_sequences_generator(texts))
  File "/home/rocassius/w266_final/scripts/modeling/rmn_data_generator.py", line 57, in __getitem__
    X, y = self.__data_generation(indices)
  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/keras_preprocessing/text.py", line 310, in texts_to_sequences_generator
    self.split)
  File "/home/rocassius/anaconda3/lib/python3.7/si

  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3443, in _reindex_non_unique
    indexer, missing = self.get_indexer_non_unique(target)
  File "/home/rocassius/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 4801, in get_indexer_non_unique
    indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
KeyboardInterrupt


In [None]:
# data_generator = RMN_DataGenerator(rmn=rmn, data_df=docs_df.sample(1000000), batch_size=256)
# rmn.model.fit_generator(data_generator, 
#                         epochs = 3, 
#                         use_multiprocessing=True,                        
#                         workers=16)

In [77]:
# Directory used below both to save the trained model and to reload it.
local_models_path = "/home/rocassius/gen-data/models"

In [78]:
# Save `rmn` under the name "SuaveRanger" in `local_models_path`.
rmn.save_rmn("SuaveRanger", local_models_path)

In [None]:
# Second model object, used to check that the saved model can be reloaded.
rmn2 = RigidRMN(dropout=0.5)

# Same tools and dimensions as `rmn`. The reload cell later overwrites all of
# these from the saved attributes except the two `infer_*` entries.
for _attr, _value in (
    ("embedding_matrix", embedding_matrix),
    ("tokenizer_dict", tokenizer_dict),
    ("metadata_dict", metadata_dict),
    ("infer_embedding_matrix", global_embedding_matrix),
    ("infer_tokenizer_dict", global_tokenizer_dict),
    ("meta_embedding_dim", 25),
    ("num_topics", 50),
):
    setattr(rmn2, _attr, _value)

In [110]:
# Reload the saved "SuaveRanger" model into `rmn2`.
name = "SuaveRanger"
save_path = local_models_path

# Path of the saved model's directory (nothing is created here).
model_path = os.path.join(save_path, RMN_TAG % name)

# Read back the attributes stored in that directory.
attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))

# Overwrite rmn2's attributes with the saved values.
for _attr, _key in (
    ("num_topics", N_TOP_KEY),
    ("embedding_matrix", EMBED_KEY),
    ("tokenizer_dict", TOKEN_KEY),
    ("metadata_dict", META_KEY),
    ("meta_embedding_dim", DIM_KEY),
):
    setattr(rmn2, _attr, attributes_dict[_key])

# Rebuild the architecture first, then load the saved weights into it.
rmn2.build_model(bias_reconstruct=False)
rmn2.model.load_weights(os.path.join(model_path, MODEL))

# Build the associated topic model.
rmn2.build_topic_model()

In [111]:
# Analyze the reloaded model on a random sample of 100,000 documents.
# The sample is seeded so the analysis cells give the same results on every
# run: the model comes from disk, so the draw is the only randomness visible
# in this cell.
analyzer = RMN_Analyzer(rmn2, docs_df.sample(100000, random_state=0))

In [112]:
# Run topic prediction on the analyzer's documents; presumably this populates
# `analyzer.topic_preds`, which later cells read.
analyzer.predict_topics()



In [114]:
# Mean of the analyzer's Shannon entropy values, ignoring NaN entries.
# The warning in the output shows the entropy is computed as
# sum(p * -log2(p)); a probability of exactly 0 makes that term NaN, which is
# presumably why nanmean is used instead of mean.
np.nanmean(analyzer.shannon_entropy())

  return np.sum(p*-np.log2(p), axis=-1)


2.0080614

In [115]:
# Topic distribution predicted for the first document, rounded to 3 decimals.
# Only the displayed row is rounded.
analyzer.topic_preds[0].round(3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.978, 0.   , 0.   ,
       0.   , 0.001, 0.   , 0.   , 0.006, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.008, 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.004, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.003, 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   ], dtype=float32)

In [116]:
# Count per topic index, largest first (see output); presumably the number of
# documents whose top-ranked topic is that index.
analyzer.first_topic_counts()

16    2780
42    2607
10    2371
22    2365
6     2327
30    2316
43    2299
35    2216
4     2209
9     2193
26    2155
19    2138
49    2135
17    2132
39    2115
32    2103
7     2098
27    2090
1     2089
23    2088
41    2079
31    2065
25    2061
47    2043
20    2036
21    2031
45    2024
44    2023
48    2020
37    2018
40    1962
3     1930
0     1888
33    1879
15    1876
34    1866
24    1825
18    1810
36    1753
12    1751
13    1746
29    1712
38    1703
11    1662
46    1628
14    1619
28    1608
8     1573
2     1543
5     1440
dtype: int64

In [117]:
# Usage value per topic index, largest first (see output); presumably the
# average weight each topic receives across the documents.
analyzer.topic_use()

16    0.027276
42    0.025296
22    0.023962
6     0.023126
30    0.022582
10    0.022536
4     0.022408
43    0.022381
19    0.021943
49    0.021896
7     0.021591
35    0.021537
27    0.021488
9     0.021276
23    0.021145
26    0.020990
32    0.020901
41    0.020895
47    0.020802
39    0.020792
17    0.020744
25    0.020600
21    0.020141
40    0.020113
48    0.020037
20    0.019955
45    0.019948
18    0.019879
31    0.019712
3     0.019709
44    0.019581
37    0.019509
15    0.019087
0     0.019042
34    0.018883
33    0.018576
12    0.018515
24    0.018298
13    0.018270
36    0.017541
29    0.017303
14    0.017172
28    0.017029
1     0.016887
46    0.016885
8     0.016853
38    0.016647
11    0.016475
5     0.016470
2     0.015316
dtype: float32

In [None]:
# NOTE(review): bare attribute access in a cell that was never executed
# (In [None]); it looks like leftover scratch -- consider removing this cell.
rmn.infer

In [118]:
# Inspect the topics in the order returned by topic_use().
topics_by_use = analyzer.topic_use().index
analyzer.rmn.inspect_topics(topics_by_use)

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 16
['valera', 'mediators', 'yolanda', 'dickens', 'marianne', 'laurent', 'polly', 'ziglar', 'irishmen', 'kennard']

Topic 42
['astronomical', 'montebello', 'generational', 'calif', 'mcgrory', 'dietitian', 'oceanographer', 'monnet', 'fla', 'neb']

Topic 22
['educator', 'theologian', 'jurist', 'lauded', 'fam', 'outspoken', 'chaired', 'eileen', 'critic', 'mullen']

Topic 6
['pruning', 'reconstructive', 'pinches', 'trimming', 'recertification', 'borlaug', 'gnawing', 'mastectomy', 'convalescing', 'premed']

Topic 30
['lingo', 'envelope', 'margie', 'etiquette', 'playbook', 'tic', 'eileen', 'planner', 'email', 'ibe']

Topic 10
['oef', 'ariz', 'exp', 'warming', 'rev', 'oly', 'stresses', 'accelerating', 'minn', 'emissions']

Topic 4
['griffiss', 'drinker', 'supporter', 'backer', 'chronicler', 'carswell', 'statesman', 'solomons', 'hm', 'puritan']

Topic 43
['theologian', 'yers', 'jurist', 'essayist', 'reformer', 'omi', 'isaacs', 'forecaster', 'biscet', 'grigory']

Topic 19
['infallible', '